@@ -4,7 +4,7 @@ import { modelNameSchema } from '#validators/download'
44import { chatSchema , getAvailableModelsSchema } from '#validators/ollama'
55import { inject } from '@adonisjs/core'
66import type { HttpContext } from '@adonisjs/core/http'
7- import { SYSTEM_PROMPTS } from '../../constants/ollama.js'
7+ import { DEFAULT_QUERY_REWRITE_MODEL , SYSTEM_PROMPTS } from '../../constants/ollama.js'
88import logger from '@adonisjs/core/services/logger'
99import type { Message } from 'ollama'
1010
@@ -28,80 +28,85 @@ export default class OllamaController {
2828 async chat ( { request, response } : HttpContext ) {
2929 const reqData = await request . validateUsing ( chatSchema )
3030
31- // If there are no system messages in the chat inject system prompts
32- const hasSystemMessage = reqData . messages . some ( ( msg ) => msg . role === 'system' )
33- if ( ! hasSystemMessage ) {
34- const systemPrompt = {
35- role : 'system' as const ,
36- content : SYSTEM_PROMPTS . default ,
37- }
38- logger . debug ( '[OllamaController] Injecting system prompt' )
39- reqData . messages . unshift ( systemPrompt )
31+ // Flush SSE headers immediately so the client connection is open while
32+ // pre-processing (query rewriting, RAG lookup) runs in the background.
33+ if ( reqData . stream ) {
34+ response . response . setHeader ( 'Content-Type' , 'text/event-stream' )
35+ response . response . setHeader ( 'Cache-Control' , 'no-cache' )
36+ response . response . setHeader ( 'Connection' , 'keep-alive' )
37+ response . response . flushHeaders ( )
4038 }
4139
42- // Query rewriting for better RAG retrieval with manageable context
43- // Will return user's latest message if no rewriting is needed
44- const rewrittenQuery = await this . rewriteQueryWithContext (
45- reqData . messages ,
46- reqData . model
47- )
48-
49- logger . debug ( `[OllamaController] Rewritten query for RAG: "${ rewrittenQuery } "` )
50- if ( rewrittenQuery ) {
51- const relevantDocs = await this . ragService . searchSimilarDocuments (
52- rewrittenQuery ,
53- 5 , // Top 5 most relevant chunks
54- 0.3 // Minimum similarity score of 0.3
55- )
56-
57- logger . debug ( `[RAG] Retrieved ${ relevantDocs . length } relevant documents for query: "${ rewrittenQuery } "` )
58-
59- // If relevant context is found, inject as a system message
60- if ( relevantDocs . length > 0 ) {
61- const contextText = relevantDocs
62- . map ( ( doc , idx ) => `[Context ${ idx + 1 } ] (Relevance: ${ ( doc . score * 100 ) . toFixed ( 1 ) } %)\n${ doc . text } ` )
63- . join ( '\n\n' )
64-
65- const systemMessage = {
40+ try {
41+ // If the chat contains no system message, inject the default system prompt
42+ const hasSystemMessage = reqData . messages . some ( ( msg ) => msg . role === 'system' )
43+ if ( ! hasSystemMessage ) {
44+ const systemPrompt = {
6645 role : 'system' as const ,
67- content : SYSTEM_PROMPTS . rag_context ( contextText ) ,
46+ content : SYSTEM_PROMPTS . default ,
6847 }
48+ logger . debug ( '[OllamaController] Injecting system prompt' )
49+ reqData . messages . unshift ( systemPrompt )
50+ }
6951
70- // Insert system message at the beginning (after any existing system messages)
71- const firstNonSystemIndex = reqData . messages . findIndex ( ( msg ) => msg . role !== 'system' )
72- const insertIndex = firstNonSystemIndex === - 1 ? 0 : firstNonSystemIndex
73- reqData . messages . splice ( insertIndex , 0 , systemMessage )
52+ // Query rewriting for better RAG retrieval with manageable context
53+ // Will return user's latest message if no rewriting is needed
54+ const rewrittenQuery = await this . rewriteQueryWithContext ( reqData . messages )
55+
56+ logger . debug ( `[OllamaController] Rewritten query for RAG: "${ rewrittenQuery } "` )
57+ if ( rewrittenQuery ) {
58+ const relevantDocs = await this . ragService . searchSimilarDocuments (
59+ rewrittenQuery ,
60+ 5 , // Top 5 most relevant chunks
61+ 0.3 // Minimum similarity score of 0.3
62+ )
63+
64+ logger . debug ( `[RAG] Retrieved ${ relevantDocs . length } relevant documents for query: "${ rewrittenQuery } "` )
65+
66+ // If relevant context is found, inject as a system message
67+ if ( relevantDocs . length > 0 ) {
68+ const contextText = relevantDocs
69+ . map ( ( doc , idx ) => `[Context ${ idx + 1 } ] (Relevance: ${ ( doc . score * 100 ) . toFixed ( 1 ) } %)\n${ doc . text } ` )
70+ . join ( '\n\n' )
71+
72+ const systemMessage = {
73+ role : 'system' as const ,
74+ content : SYSTEM_PROMPTS . rag_context ( contextText ) ,
75+ }
76+
77+ // Insert system message at the beginning (after any existing system messages)
78+ const firstNonSystemIndex = reqData . messages . findIndex ( ( msg ) => msg . role !== 'system' )
79+ const insertIndex = firstNonSystemIndex === - 1 ? 0 : firstNonSystemIndex
80+ reqData . messages . splice ( insertIndex , 0 , systemMessage )
81+ }
7482 }
75- }
7683
77- // Check if the model supports "thinking" capability for enhanced response generation
78- // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
79- const thinkingCapability = await this . ollamaService . checkModelHasThinking ( reqData . model )
80- const think : boolean | 'medium' = thinkingCapability ? ( reqData . model . startsWith ( 'gpt-oss' ) ? 'medium' : true ) : false
81-
82- if ( reqData . stream ) {
83- logger . debug ( `[OllamaController] Initiating streaming response for model: "${ reqData . model } " with think: ${ think } ` )
84- // SSE streaming path
85- response . response . setHeader ( 'Content-Type' , 'text/event-stream' )
86- response . response . setHeader ( 'Cache-Control' , 'no-cache' )
87- response . response . setHeader ( 'Connection' , 'keep-alive' )
88- response . response . flushHeaders ( )
84+ // Check if the model supports "thinking" capability for enhanced response generation
85+ // For gpt-oss models the "think" param must be a string level (e.g. 'medium'), not a boolean — https://docs.ollama.com/api/chat
86+ const thinkingCapability = await this . ollamaService . checkModelHasThinking ( reqData . model )
87+ const think : boolean | 'medium' = thinkingCapability ? ( reqData . model . startsWith ( 'gpt-oss' ) ? 'medium' : true ) : false
8988
90- try {
89+ if ( reqData . stream ) {
90+ logger . debug ( `[OllamaController] Initiating streaming response for model: "${ reqData . model } " with think: ${ think } ` )
91+ // Headers already flushed above
9192 const stream = await this . ollamaService . chatStream ( { ...reqData , think } )
9293 for await ( const chunk of stream ) {
9394 response . response . write ( `data: ${ JSON . stringify ( chunk ) } \n\n` )
9495 }
95- } catch ( error ) {
96+ response . response . end ( )
97+ return
98+ }
99+
100+ // Non-streaming (legacy) path
101+ return await this . ollamaService . chat ( { ...reqData , think } )
102+ } catch ( error ) {
103+ if ( reqData . stream ) {
96104 response . response . write ( `data: ${ JSON . stringify ( { error : true } ) } \n\n` )
97- } finally {
98105 response . response . end ( )
106+ return
99107 }
100- return
108+ throw error
101109 }
102-
103- // Non-streaming (legacy) path
104- return await this . ollamaService . chat ( { ...reqData , think } )
105110 }
106111
107112 async deleteModel ( { request } : HttpContext ) {
@@ -127,17 +132,17 @@ export default class OllamaController {
127132 }
128133
129134 private async rewriteQueryWithContext (
130- messages : Message [ ] ,
131- model : string
135+ messages : Message [ ]
132136 ) : Promise < string | null > {
133137 try {
134138 // Get recent conversation history (last 6 messages for 3 turns)
135139 const recentMessages = messages . slice ( - 6 )
136140
137- // If there's only one user message, no rewriting needed
141+ // Skip rewriting for short conversations. Rewriting adds latency with
142+ // little RAG benefit until there is enough context to matter.
138143 const userMessages = recentMessages . filter ( msg => msg . role === 'user' )
139- if ( userMessages . length <= 1 ) {
140- return userMessages [ 0 ] ?. content || null
144+ if ( userMessages . length <= 2 ) {
145+ return userMessages [ userMessages . length - 1 ] ?. content || null
141146 }
142147
143148 const conversationContext = recentMessages
@@ -151,8 +156,17 @@ export default class OllamaController {
151156 } )
152157 . join ( '\n' )
153158
159+ const availableModels = await this . ollamaService . getAvailableModels ( { query : null , limit : 500 } )
160+ const rewriteModelAvailable = availableModels ?. models . some ( model => model . name === DEFAULT_QUERY_REWRITE_MODEL )
161+ if ( ! rewriteModelAvailable ) {
162+ logger . warn ( `[RAG] Query rewrite model "${ DEFAULT_QUERY_REWRITE_MODEL } " not available. Skipping query rewriting.` )
163+ const lastUserMessage = [ ...messages ] . reverse ( ) . find ( msg => msg . role === 'user' )
164+ return lastUserMessage ?. content || null
165+ }
166+
167+ // FUTURE ENHANCEMENT: allow the user to specify which model to use for rewriting
154168 const response = await this . ollamaService . chat ( {
155- model,
169+ model : DEFAULT_QUERY_REWRITE_MODEL ,
156170 messages : [
157171 {
158172 role : 'system' ,
0 commit comments