Skip to content

Commit 274351b

Browse files
committed
fix(AI): improve performance via query-rewriting and streaming logic
1 parent e319da6 commit 274351b

File tree

6 files changed

+161
-74
lines changed

6 files changed

+161
-74
lines changed

admin/app/controllers/ollama_controller.ts

Lines changed: 79 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { modelNameSchema } from '#validators/download'
44
import { chatSchema, getAvailableModelsSchema } from '#validators/ollama'
55
import { inject } from '@adonisjs/core'
66
import type { HttpContext } from '@adonisjs/core/http'
7-
import { SYSTEM_PROMPTS } from '../../constants/ollama.js'
7+
import { DEFAULT_QUERY_REWRITE_MODEL, SYSTEM_PROMPTS } from '../../constants/ollama.js'
88
import logger from '@adonisjs/core/services/logger'
99
import type { Message } from 'ollama'
1010

@@ -28,80 +28,85 @@ export default class OllamaController {
2828
async chat({ request, response }: HttpContext) {
2929
const reqData = await request.validateUsing(chatSchema)
3030

31-
// If there are no system messages in the chat inject system prompts
32-
const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system')
33-
if (!hasSystemMessage) {
34-
const systemPrompt = {
35-
role: 'system' as const,
36-
content: SYSTEM_PROMPTS.default,
37-
}
38-
logger.debug('[OllamaController] Injecting system prompt')
39-
reqData.messages.unshift(systemPrompt)
31+
// Flush SSE headers immediately so the client connection is open while
32+
// pre-processing (query rewriting, RAG lookup) runs in the background.
33+
if (reqData.stream) {
34+
response.response.setHeader('Content-Type', 'text/event-stream')
35+
response.response.setHeader('Cache-Control', 'no-cache')
36+
response.response.setHeader('Connection', 'keep-alive')
37+
response.response.flushHeaders()
4038
}
4139

42-
// Query rewriting for better RAG retrieval with manageable context
43-
// Will return user's latest message if no rewriting is needed
44-
const rewrittenQuery = await this.rewriteQueryWithContext(
45-
reqData.messages,
46-
reqData.model
47-
)
48-
49-
logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`)
50-
if (rewrittenQuery) {
51-
const relevantDocs = await this.ragService.searchSimilarDocuments(
52-
rewrittenQuery,
53-
5, // Top 5 most relevant chunks
54-
0.3 // Minimum similarity score of 0.3
55-
)
56-
57-
logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`)
58-
59-
// If relevant context is found, inject as a system message
60-
if (relevantDocs.length > 0) {
61-
const contextText = relevantDocs
62-
.map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
63-
.join('\n\n')
64-
65-
const systemMessage = {
40+
try {
41+
// If there are no system messages in the chat inject system prompts
42+
const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system')
43+
if (!hasSystemMessage) {
44+
const systemPrompt = {
6645
role: 'system' as const,
67-
content: SYSTEM_PROMPTS.rag_context(contextText),
46+
content: SYSTEM_PROMPTS.default,
6847
}
48+
logger.debug('[OllamaController] Injecting system prompt')
49+
reqData.messages.unshift(systemPrompt)
50+
}
6951

70-
// Insert system message at the beginning (after any existing system messages)
71-
const firstNonSystemIndex = reqData.messages.findIndex((msg) => msg.role !== 'system')
72-
const insertIndex = firstNonSystemIndex === -1 ? 0 : firstNonSystemIndex
73-
reqData.messages.splice(insertIndex, 0, systemMessage)
52+
// Query rewriting for better RAG retrieval with manageable context
53+
// Will return user's latest message if no rewriting is needed
54+
const rewrittenQuery = await this.rewriteQueryWithContext(reqData.messages)
55+
56+
logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`)
57+
if (rewrittenQuery) {
58+
const relevantDocs = await this.ragService.searchSimilarDocuments(
59+
rewrittenQuery,
60+
5, // Top 5 most relevant chunks
61+
0.3 // Minimum similarity score of 0.3
62+
)
63+
64+
logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`)
65+
66+
// If relevant context is found, inject as a system message
67+
if (relevantDocs.length > 0) {
68+
const contextText = relevantDocs
69+
.map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
70+
.join('\n\n')
71+
72+
const systemMessage = {
73+
role: 'system' as const,
74+
content: SYSTEM_PROMPTS.rag_context(contextText),
75+
}
76+
77+
// Insert system message at the beginning (after any existing system messages)
78+
const firstNonSystemIndex = reqData.messages.findIndex((msg) => msg.role !== 'system')
79+
const insertIndex = firstNonSystemIndex === -1 ? 0 : firstNonSystemIndex
80+
reqData.messages.splice(insertIndex, 0, systemMessage)
81+
}
7482
}
75-
}
7683

77-
// Check if the model supports "thinking" capability for enhanced response generation
78-
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
79-
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
80-
const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false
81-
82-
if (reqData.stream) {
83-
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
84-
// SSE streaming path
85-
response.response.setHeader('Content-Type', 'text/event-stream')
86-
response.response.setHeader('Cache-Control', 'no-cache')
87-
response.response.setHeader('Connection', 'keep-alive')
88-
response.response.flushHeaders()
84+
// Check if the model supports "thinking" capability for enhanced response generation
85+
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
86+
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
87+
const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false
8988

90-
try {
89+
if (reqData.stream) {
90+
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
91+
// Headers already flushed above
9192
const stream = await this.ollamaService.chatStream({ ...reqData, think })
9293
for await (const chunk of stream) {
9394
response.response.write(`data: ${JSON.stringify(chunk)}\n\n`)
9495
}
95-
} catch (error) {
96+
response.response.end()
97+
return
98+
}
99+
100+
// Non-streaming (legacy) path
101+
return await this.ollamaService.chat({ ...reqData, think })
102+
} catch (error) {
103+
if (reqData.stream) {
96104
response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`)
97-
} finally {
98105
response.response.end()
106+
return
99107
}
100-
return
108+
throw error
101109
}
102-
103-
// Non-streaming (legacy) path
104-
return await this.ollamaService.chat({ ...reqData, think })
105110
}
106111

107112
async deleteModel({ request }: HttpContext) {
@@ -127,17 +132,17 @@ export default class OllamaController {
127132
}
128133

129134
private async rewriteQueryWithContext(
130-
messages: Message[],
131-
model: string
135+
messages: Message[]
132136
): Promise<string | null> {
133137
try {
134138
// Get recent conversation history (last 6 messages for 3 turns)
135139
const recentMessages = messages.slice(-6)
136140

137-
// If there's only one user message, no rewriting needed
141+
// Skip rewriting for short conversations. Rewriting adds latency with
142+
// little RAG benefit until there is enough context to matter.
138143
const userMessages = recentMessages.filter(msg => msg.role === 'user')
139-
if (userMessages.length <= 1) {
140-
return userMessages[0]?.content || null
144+
if (userMessages.length <= 2) {
145+
return userMessages[userMessages.length - 1]?.content || null
141146
}
142147

143148
const conversationContext = recentMessages
@@ -151,8 +156,17 @@ export default class OllamaController {
151156
})
152157
.join('\n')
153158

159+
const availableModels = await this.ollamaService.getAvailableModels({ query: null, limit: 500 })
160+
const rewriteModelAvailable = availableModels?.models.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL)
161+
if (!rewriteModelAvailable) {
162+
logger.warn(`[RAG] Query rewrite model "${DEFAULT_QUERY_REWRITE_MODEL}" not available. Skipping query rewriting.`)
163+
const lastUserMessage = [...messages].reverse().find(msg => msg.role === 'user')
164+
return lastUserMessage?.content || null
165+
}
166+
167+
// FUTURE ENHANCEMENT: allow the user to specify which model to use for rewriting
154168
const response = await this.ollamaService.chat({
155-
model,
169+
model: DEFAULT_QUERY_REWRITE_MODEL,
156170
messages: [
157171
{
158172
role: 'system',

admin/constants/ollama.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ export const FALLBACK_RECOMMENDED_OLLAMA_MODELS: NomadOllamaModel[] = [
6262
},
6363
]
6464

65+
export const DEFAULT_QUERY_REWRITE_MODEL = 'qwen2.5:3b' // default to qwen2.5 for query rewriting with good balance of text task performance and resource usage
66+
6567
export const SYSTEM_PROMPTS = {
6668
default: `
6769
Format all responses using markdown for better readability. Vanilla markdown or GitHub-flavored markdown is preferred.

admin/inertia/components/chat/ChatInterface.tsx

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ import { ChatMessage } from '../../../types/chat'
55
import ChatMessageBubble from './ChatMessageBubble'
66
import ChatAssistantAvatar from './ChatAssistantAvatar'
77
import BouncingDots from '../BouncingDots'
8+
import StyledModal from '../StyledModal'
9+
import api from '~/lib/api'
10+
import { DEFAULT_QUERY_REWRITE_MODEL } from '../../../constants/ollama'
11+
import { useNotifications } from '~/context/NotificationContext'
812

913
interface ChatInterfaceProps {
1014
messages: ChatMessage[]
@@ -13,6 +17,7 @@ interface ChatInterfaceProps {
1317
chatSuggestions?: string[]
1418
chatSuggestionsEnabled?: boolean
1519
chatSuggestionsLoading?: boolean
20+
rewriteModelAvailable?: boolean
1621
}
1722

1823
export default function ChatInterface({
@@ -22,11 +27,28 @@ export default function ChatInterface({
2227
chatSuggestions = [],
2328
chatSuggestionsEnabled = false,
2429
chatSuggestionsLoading = false,
30+
rewriteModelAvailable = false
2531
}: ChatInterfaceProps) {
32+
const { addNotification } = useNotifications()
2633
const [input, setInput] = useState('')
34+
const [downloadDialogOpen, setDownloadDialogOpen] = useState(false)
35+
const [isDownloading, setIsDownloading] = useState(false)
2736
const messagesEndRef = useRef<HTMLDivElement>(null)
2837
const textareaRef = useRef<HTMLTextAreaElement>(null)
2938

39+
const handleDownloadModel = async () => {
40+
setIsDownloading(true)
41+
try {
42+
await api.downloadModel(DEFAULT_QUERY_REWRITE_MODEL)
43+
addNotification({ type: 'success', message: 'Model download queued' })
44+
} catch (error) {
45+
addNotification({ type: 'error', message: 'Failed to queue model download' })
46+
} finally {
47+
setIsDownloading(false)
48+
setDownloadDialogOpen(false)
49+
}
50+
}
51+
3052
const scrollToBottom = () => {
3153
messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' })
3254
}
@@ -162,6 +184,36 @@ export default function ChatInterface({
162184
)}
163185
</button>
164186
</form>
187+
{!rewriteModelAvailable && (
188+
<div className="text-sm text-gray-500 mt-2">
189+
The {DEFAULT_QUERY_REWRITE_MODEL} model is not installed. Consider{' '}
190+
<button
191+
onClick={() => setDownloadDialogOpen(true)}
192+
className="text-desert-green underline hover:text-desert-green/80 cursor-pointer"
193+
>
194+
downloading it
195+
</button>{' '}
196+
for improved retrieval-augmented generation (RAG) performance.
197+
</div>
198+
)}
199+
<StyledModal
200+
open={downloadDialogOpen}
201+
title={`Download ${DEFAULT_QUERY_REWRITE_MODEL}?`}
202+
confirmText="Download"
203+
cancelText="Cancel"
204+
confirmIcon='IconDownload'
205+
confirmVariant='primary'
206+
confirmLoading={isDownloading}
207+
onConfirm={handleDownloadModel}
208+
onCancel={() => setDownloadDialogOpen(false)}
209+
onClose={() => setDownloadDialogOpen(false)}
210+
>
211+
<p className="text-gray-700">
212+
This will dispatch a background download job for{' '}
213+
<span className="font-mono font-medium">{DEFAULT_QUERY_REWRITE_MODEL}</span> and may take some time to complete. The model
214+
will be used to rewrite queries for improved RAG retrieval performance.
215+
</p>
216+
</StyledModal>
165217
</div>
166218
</div>
167219
)

admin/inertia/components/chat/ChatMessageBubble.tsx

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ export default function ChatMessageBubble({ message }: ChatMessageBubbleProps) {
2929
{!message.isThinking && message.thinking && (
3030
<details className="mb-3 rounded border border-gray-200 bg-gray-50 text-xs">
3131
<summary className="cursor-pointer px-3 py-2 font-medium text-gray-500 hover:text-gray-700 select-none">
32-
Reasoning
32+
{message.thinkingDuration !== undefined
33+
? `Thought for ${message.thinkingDuration}s`
34+
: 'Reasoning'}
3335
</summary>
3436
<div className="px-3 pb-3 prose prose-xs max-w-none text-gray-600 max-h-48 overflow-y-auto border-t border-gray-200 pt-2">
3537
<ReactMarkdown remarkPlugins={[remarkGfm]}>{message.thinking}</ReactMarkdown>

admin/inertia/components/chat/index.tsx

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { useState, useCallback, useEffect, useRef } from 'react'
1+
import { useState, useCallback, useEffect, useRef, useMemo } from 'react'
22
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
33
import ChatSidebar from './ChatSidebar'
44
import ChatInterface from './ChatInterface'
@@ -9,6 +9,7 @@ import { useModals } from '~/context/ModalContext'
99
import { ChatMessage } from '../../../types/chat'
1010
import classNames from '~/lib/classNames'
1111
import { IconX } from '@tabler/icons-react'
12+
import { DEFAULT_QUERY_REWRITE_MODEL } from '../../../constants/ollama'
1213

1314
interface ChatProps {
1415
enabled: boolean
@@ -68,6 +69,10 @@ export default function Chat({
6869
refetchOnMount: false,
6970
})
7071

72+
const rewriteModelAvailable = useMemo(() => {
73+
return installedModels.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL)
74+
}, [installedModels])
75+
7176
const deleteAllSessionsMutation = useMutation({
7277
mutationFn: () => api.deleteAllChatSessions(),
7378
onSuccess: () => {
@@ -159,7 +164,7 @@ export default function Chat({
159164
async (sessionId: string) => {
160165
// Cancel any ongoing suggestions fetch
161166
queryClient.cancelQueries({ queryKey: ['chatSuggestions'] })
162-
167+
163168
setActiveSessionId(sessionId)
164169
// Load messages for this session
165170
const sessionData = await api.getChatSession(sessionId)
@@ -230,11 +235,16 @@ export default function Chat({
230235
let fullContent = ''
231236
let thinkingContent = ''
232237
let isThinkingPhase = true
238+
let thinkingStartTime: number | null = null
239+
let thinkingDuration: number | null = null
233240

234241
try {
235242
await api.streamChatMessage(
236243
{ model: selectedModel || 'llama3.2', messages: chatMessages, stream: true },
237244
(chunkContent, chunkThinking, done) => {
245+
if (chunkThinking.length > 0 && thinkingStartTime === null) {
246+
thinkingStartTime = Date.now()
247+
}
238248
if (isFirstChunk) {
239249
isFirstChunk = false
240250
setIsStreamingResponse(false)
@@ -248,22 +258,27 @@ export default function Chat({
248258
timestamp: new Date(),
249259
isStreaming: true,
250260
isThinking: chunkThinking.length > 0 && chunkContent.length === 0,
261+
thinkingDuration: undefined,
251262
},
252263
])
253264
} else {
254265
if (isThinkingPhase && chunkContent.length > 0) {
255266
isThinkingPhase = false
267+
if (thinkingStartTime !== null) {
268+
thinkingDuration = Math.max(1, Math.round((Date.now() - thinkingStartTime) / 1000))
269+
}
256270
}
257271
setMessages((prev) =>
258272
prev.map((m) =>
259273
m.id === assistantMsgId
260274
? {
261-
...m,
262-
content: m.content + chunkContent,
263-
thinking: (m.thinking ?? '') + chunkThinking,
264-
isStreaming: !done,
265-
isThinking: isThinkingPhase,
266-
}
275+
...m,
276+
content: m.content + chunkContent,
277+
thinking: (m.thinking ?? '') + chunkThinking,
278+
isStreaming: !done,
279+
isThinking: isThinkingPhase,
280+
thinkingDuration: thinkingDuration ?? undefined,
281+
}
267282
: m
268283
)
269284
)
@@ -391,6 +406,7 @@ export default function Chat({
391406
chatSuggestions={chatSuggestions}
392407
chatSuggestionsEnabled={suggestionsEnabled}
393408
chatSuggestionsLoading={chatSuggestionsLoading}
409+
rewriteModelAvailable={rewriteModelAvailable}
394410
/>
395411
</div>
396412
</div>

0 commit comments

Comments (0)