@@ -5,7 +5,7 @@ import { modelNameSchema } from '#validators/download'
55import { chatSchema , getAvailableModelsSchema } from '#validators/ollama'
66import { inject } from '@adonisjs/core'
77import type { HttpContext } from '@adonisjs/core/http'
8- import { DEFAULT_QUERY_REWRITE_MODEL , SYSTEM_PROMPTS } from '../../constants/ollama.js'
8+ import { DEFAULT_QUERY_REWRITE_MODEL , RAG_CONTEXT_LIMITS , SYSTEM_PROMPTS } from '../../constants/ollama.js'
99import logger from '@adonisjs/core/services/logger'
1010import type { Message } from 'ollama'
1111
@@ -66,9 +66,28 @@ export default class OllamaController {
6666
6767 logger . debug ( `[RAG] Retrieved ${ relevantDocs . length } relevant documents for query: "${ rewrittenQuery } "` )
6868
69- // If relevant context is found, inject as a system message
69+ // If relevant context is found, inject as a system message with adaptive limits
7070 if ( relevantDocs . length > 0 ) {
71- const contextText = relevantDocs
71+ // Determine context budget based on model size
72+ const { maxResults, maxTokens } = this . getContextLimitsForModel ( reqData . model )
73+ let trimmedDocs = relevantDocs . slice ( 0 , maxResults )
74+
75+ // Apply token cap if set (estimate ~4 chars per token)
76+ // Always include the first (most relevant) result — the cap only gates subsequent results
77+ if ( maxTokens > 0 ) {
78+ const charCap = maxTokens * 4
79+ let totalChars = 0
80+ trimmedDocs = trimmedDocs . filter ( ( doc , idx ) => {
81+ totalChars += doc . text . length
82+ return idx === 0 || totalChars <= charCap
83+ } )
84+ }
85+
86+ logger . debug (
87+ `[RAG] Injecting ${ trimmedDocs . length } /${ relevantDocs . length } results (model: ${ reqData . model } , maxResults: ${ maxResults } , maxTokens: ${ maxTokens || 'unlimited' } )`
88+ )
89+
90+ const contextText = trimmedDocs
7291 . map ( ( doc , idx ) => `[Context ${ idx + 1 } ] (Relevance: ${ ( doc . score * 100 ) . toFixed ( 1 ) } %)\n${ doc . text } ` )
7392 . join ( '\n\n' )
7493
@@ -174,6 +193,25 @@ export default class OllamaController {
174193 return await this . ollamaService . getModels ( )
175194 }
176195
/**
 * Determines the RAG context budget for a model from the parameter count
 * embedded in its name or tag.
 *
 * The size is taken from the first `<number>b` token (billions), e.g.
 * "llama3.2:3b", "qwen2.5:1.5b", "gemma:7b". If no such token exists, the first
 * `<number>m` token (millions) is used instead, e.g. "smollm:135m", so that
 * sub-billion models are not mistaken for the 8B default. Matching is
 * case-insensitive, and the unit letter must not be followed by another letter,
 * which keeps tokens such as "4bit" or "2base" from being read as a size.
 *
 * When no size can be extracted, the model is treated as 8B.
 *
 * @param modelName - Model name (optionally with a tag) to inspect.
 * @returns The `maxResults` / `maxTokens` of the first `RAG_CONTEXT_LIMITS`
 *   tier whose `maxParams` is at least the detected size. If no tier matches,
 *   returns up to 5 results with `maxTokens: 0`.
 */
private getContextLimitsForModel(modelName: string): { maxResults: number; maxTokens: number } {
  // Size assumed when the name carries no recognizable parameter count.
  const defaultParamBillions = 8

  // Billions take precedence: a name may also contain an unrelated "<n>m" token.
  const billionsMatch = /(\d+(?:\.\d+)?)b(?![a-z])/i.exec(modelName)
  const millionsMatch = billionsMatch ? null : /(\d+(?:\.\d+)?)m(?![a-z])/i.exec(modelName)

  let paramBillions = defaultParamBillions
  if (billionsMatch?.[1] !== undefined) {
    paramBillions = Number.parseFloat(billionsMatch[1])
  } else if (millionsMatch?.[1] !== undefined) {
    // Normalize millions to billions so tiers can be compared on one scale.
    paramBillions = Number.parseFloat(millionsMatch[1]) / 1000
  }

  // First matching tier wins.
  // NOTE(review): this relies on RAG_CONTEXT_LIMITS being ordered by ascending maxParams — confirm.
  for (const tier of RAG_CONTEXT_LIMITS) {
    if (paramBillions <= tier.maxParams) {
      return { maxResults: tier.maxResults, maxTokens: tier.maxTokens }
    }
  }

  // Fallback for models larger than every tier: keep at most 5 results and
  // return maxTokens 0 (the > 0 check in the chat handler skips the token cap).
  return { maxResults: 5, maxTokens: 0 }
}
214+
177215 private async rewriteQueryWithContext (
178216 messages : Message [ ]
179217 ) : Promise < string | null > {
@@ -199,8 +237,8 @@ export default class OllamaController {
199237 } )
200238 . join ( '\n' )
201239
202- const availableModels = await this . ollamaService . getAvailableModels ( { query : null , limit : 500 } )
203- const rewriteModelAvailable = availableModels ?. models . some ( model => model . name === DEFAULT_QUERY_REWRITE_MODEL )
240+ const installedModels = await this . ollamaService . getModels ( true )
241+ const rewriteModelAvailable = installedModels ? .some ( model => model . name === DEFAULT_QUERY_REWRITE_MODEL )
204242 if ( ! rewriteModelAvailable ) {
205243 logger . warn ( `[RAG] Query rewrite model "${ DEFAULT_QUERY_REWRITE_MODEL } " not available. Skipping query rewriting.` )
206244 const lastUserMessage = [ ...messages ] . reverse ( ) . find ( msg => msg . role === 'user' )
0 commit comments