@@ -4,7 +4,7 @@ import { modelNameSchema } from '#validators/download'
44import { chatSchema , getAvailableModelsSchema } from '#validators/ollama'
55import { inject } from '@adonisjs/core'
66import type { HttpContext } from '@adonisjs/core/http'
7- import { SYSTEM_PROMPTS } from '../../constants/ollama.js'
7+ import { DEFAULT_QUERY_REWRITE_MODEL , SYSTEM_PROMPTS } from '../../constants/ollama.js'
88import logger from '@adonisjs/core/services/logger'
99import type { Message } from 'ollama'
1010
@@ -28,80 +28,85 @@ export default class OllamaController {
2828 async chat ( { request, response } : HttpContext ) {
2929 const reqData = await request . validateUsing ( chatSchema )
3030
31- // If there are no system messages in the chat inject system prompts
32- const hasSystemMessage = reqData . messages . some ( ( msg ) => msg . role === 'system' )
33- if ( ! hasSystemMessage ) {
34- const systemPrompt = {
35- role : 'system' as const ,
36- content : SYSTEM_PROMPTS . default ,
37- }
38- logger . debug ( '[OllamaController] Injecting system prompt' )
39- reqData . messages . unshift ( systemPrompt )
31+ // Flush SSE headers immediately so the client connection is open while
32+ // pre-processing (query rewriting, RAG lookup) runs in the background.
33+ if ( reqData . stream ) {
34+ response . response . setHeader ( 'Content-Type' , 'text/event-stream' )
35+ response . response . setHeader ( 'Cache-Control' , 'no-cache' )
36+ response . response . setHeader ( 'Connection' , 'keep-alive' )
37+ response . response . flushHeaders ( )
4038 }
4139
42- // Query rewriting for better RAG retrieval with manageable context
43- // Will return user's latest message if no rewriting is needed
44- const rewrittenQuery = await this . rewriteQueryWithContext (
45- reqData . messages ,
46- reqData . model
47- )
48-
49- logger . debug ( `[OllamaController] Rewritten query for RAG: "${ rewrittenQuery } "` )
50- if ( rewrittenQuery ) {
51- const relevantDocs = await this . ragService . searchSimilarDocuments (
52- rewrittenQuery ,
53- 5 , // Top 5 most relevant chunks
54- 0.3 // Minimum similarity score of 0.3
55- )
56-
57- logger . debug ( `[RAG] Retrieved ${ relevantDocs . length } relevant documents for query: "${ rewrittenQuery } "` )
58-
59- // If relevant context is found, inject as a system message
60- if ( relevantDocs . length > 0 ) {
61- const contextText = relevantDocs
62- . map ( ( doc , idx ) => `[Context ${ idx + 1 } ] (Relevance: ${ ( doc . score * 100 ) . toFixed ( 1 ) } %)\n${ doc . text } ` )
63- . join ( '\n\n' )
64-
65- const systemMessage = {
40+ try {
41+ // If the chat contains no system message, inject the default system prompt
42+ const hasSystemMessage = reqData . messages . some ( ( msg ) => msg . role === 'system' )
43+ if ( ! hasSystemMessage ) {
44+ const systemPrompt = {
6645 role : 'system' as const ,
67- content : SYSTEM_PROMPTS . rag_context ( contextText ) ,
46+ content : SYSTEM_PROMPTS . default ,
6847 }
48+ logger . debug ( '[OllamaController] Injecting system prompt' )
49+ reqData . messages . unshift ( systemPrompt )
50+ }
6951
70- // Insert system message at the beginning (after any existing system messages)
71- const firstNonSystemIndex = reqData . messages . findIndex ( ( msg ) => msg . role !== 'system' )
72- const insertIndex = firstNonSystemIndex === - 1 ? 0 : firstNonSystemIndex
73- reqData . messages . splice ( insertIndex , 0 , systemMessage )
52+ // Query rewriting for better RAG retrieval with manageable context
53+ // Will return user's latest message if no rewriting is needed
54+ const rewrittenQuery = await this . rewriteQueryWithContext ( reqData . messages )
55+
56+ logger . debug ( `[OllamaController] Rewritten query for RAG: "${ rewrittenQuery } "` )
57+ if ( rewrittenQuery ) {
58+ const relevantDocs = await this . ragService . searchSimilarDocuments (
59+ rewrittenQuery ,
60+ 5 , // Top 5 most relevant chunks
61+ 0.3 // Minimum similarity score of 0.3
62+ )
63+
64+ logger . debug ( `[RAG] Retrieved ${ relevantDocs . length } relevant documents for query: "${ rewrittenQuery } "` )
65+
66+ // If relevant context is found, inject as a system message
67+ if ( relevantDocs . length > 0 ) {
68+ const contextText = relevantDocs
69+ . map ( ( doc , idx ) => `[Context ${ idx + 1 } ] (Relevance: ${ ( doc . score * 100 ) . toFixed ( 1 ) } %)\n${ doc . text } ` )
70+ . join ( '\n\n' )
71+
72+ const systemMessage = {
73+ role : 'system' as const ,
74+ content : SYSTEM_PROMPTS . rag_context ( contextText ) ,
75+ }
76+
77+ // Insert system message at the beginning (after any existing system messages)
78+ const firstNonSystemIndex = reqData . messages . findIndex ( ( msg ) => msg . role !== 'system' )
79+ const insertIndex = firstNonSystemIndex === - 1 ? 0 : firstNonSystemIndex
80+ reqData . messages . splice ( insertIndex , 0 , systemMessage )
81+ }
7482 }
75- }
7683
77- // Check if the model supports "thinking" capability for enhanced response generation
78- // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
79- const thinkingCapability = await this . ollamaService . checkModelHasThinking ( reqData . model )
80- const think : boolean | 'medium' = thinkingCapability ? ( reqData . model . startsWith ( 'gpt-oss' ) ? 'medium' : true ) : false
81-
82- if ( reqData . stream ) {
83- logger . debug ( `[OllamaController] Initiating streaming response for model: "${ reqData . model } " with think: ${ think } ` )
84- // SSE streaming path
85- response . response . setHeader ( 'Content-Type' , 'text/event-stream' )
86- response . response . setHeader ( 'Cache-Control' , 'no-cache' )
87- response . response . setHeader ( 'Connection' , 'keep-alive' )
88- response . response . flushHeaders ( )
84+ // Check if the model supports "thinking" capability for enhanced response generation
85+ // For gpt-oss models the "think" param must be a string level (e.g. 'medium'), not a boolean — https://docs.ollama.com/api/chat
86+ const thinkingCapability = await this . ollamaService . checkModelHasThinking ( reqData . model )
87+ const think : boolean | 'medium' = thinkingCapability ? ( reqData . model . startsWith ( 'gpt-oss' ) ? 'medium' : true ) : false
8988
90- try {
89+ if ( reqData . stream ) {
90+ logger . debug ( `[OllamaController] Initiating streaming response for model: "${ reqData . model } " with think: ${ think } ` )
91+ // Headers already flushed above
9192 const stream = await this . ollamaService . chatStream ( { ...reqData , think } )
9293 for await ( const chunk of stream ) {
9394 response . response . write ( `data: ${ JSON . stringify ( chunk ) } \n\n` )
9495 }
95- } catch ( error ) {
96+ response . response . end ( )
97+ return
98+ }
99+
100+ // Non-streaming (legacy) path
101+ return await this . ollamaService . chat ( { ...reqData , think } )
102+ } catch ( error ) {
103+ if ( reqData . stream ) {
96104 response . response . write ( `data: ${ JSON . stringify ( { error : true } ) } \n\n` )
97- } finally {
98105 response . response . end ( )
106+ return
99107 }
100- return
108+ throw error
101109 }
102-
103- // Non-streaming (legacy) path
104- return await this . ollamaService . chat ( { ...reqData , think } )
105110 }
106111
107112 async deleteModel ( { request } : HttpContext ) {
@@ -127,17 +132,17 @@ export default class OllamaController {
127132 }
128133
129134 private async rewriteQueryWithContext (
130- messages : Message [ ] ,
131- model : string
135+ messages : Message [ ]
132136 ) : Promise < string | null > {
133137 try {
134138 // Get recent conversation history (last 6 messages for 3 turns)
135139 const recentMessages = messages . slice ( - 6 )
136140
137- // If there's only one user message, no rewriting needed
141+ // Skip rewriting for short conversations. Rewriting adds latency with
142+ // little RAG benefit until there is enough context to matter.
138143 const userMessages = recentMessages . filter ( msg => msg . role === 'user' )
139- if ( userMessages . length <= 1 ) {
140- return userMessages [ 0 ] ?. content || null
144+ if ( userMessages . length <= 2 ) {
145+ return userMessages [ userMessages . length - 1 ] ?. content || null
141146 }
142147
143148 const conversationContext = recentMessages
@@ -151,8 +156,17 @@ export default class OllamaController {
151156 } )
152157 . join ( '\n' )
153158
159+ const availableModels = await this . ollamaService . getAvailableModels ( { query : null , limit : 500 } )
160+ const rewriteModelAvailable = availableModels ?. models . some ( model => model . name === DEFAULT_QUERY_REWRITE_MODEL )
161+ if ( ! rewriteModelAvailable ) {
162+ logger . warn ( `[RAG] Query rewrite model "${ DEFAULT_QUERY_REWRITE_MODEL } " not available. Skipping query rewriting.` )
163+ const lastUserMessage = [ ...messages ] . reverse ( ) . find ( msg => msg . role === 'user' )
164+ return lastUserMessage ?. content || null
165+ }
166+
167+ // FUTURE ENHANCEMENT: allow the user to specify which model to use for rewriting
154168 const response = await this . ollamaService . chat ( {
155- model,
169+ model : DEFAULT_QUERY_REWRITE_MODEL ,
156170 messages : [
157171 {
158172 role : 'system' ,
0 commit comments