Skip to content

Commit 4747863

Browse files
committed
feat(AI Assistant): allow manual scan and resync KB
1 parent 9301c44 commit 4747863

File tree

7 files changed

+233
-9
lines changed

7 files changed

+233
-9
lines changed

admin/app/controllers/ollama_controller.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,14 @@ export default class OllamaController {
2727
async chat({ request }: HttpContext) {
2828
const reqData = await request.validateUsing(chatSchema)
2929

30-
// If there are no system messages in the chat
31-
// (i.e. first message from the user) inject system prompts
30+
// If there are no system messages in the chat inject system prompts
3231
const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system')
3332
if (!hasSystemMessage) {
3433
const systemPrompt = {
3534
role: 'system' as const,
3635
content: SYSTEM_PROMPTS.default,
3736
}
37+
logger.debug('[OllamaController] Injecting system prompt')
3838
reqData.messages.unshift(systemPrompt)
3939
}
4040

@@ -45,6 +45,7 @@ export default class OllamaController {
4545
reqData.model
4646
)
4747

48+
logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`)
4849
if (rewrittenQuery) {
4950
const relevantDocs = await this.ragService.searchSimilarDocuments(
5051
rewrittenQuery,

admin/app/controllers/rag_controller.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import { getJobStatusSchema } from '#validators/rag'
99

1010
@inject()
1111
export default class RagController {
12-
constructor(private ragService: RagService) {}
12+
/**
 * Stores the RagService instance handed to the controller
 * (the class is decorated with `@inject()`).
 */
constructor(
  private ragService: RagService
) {}
1313

1414
public async upload({ request, response }: HttpContext) {
1515
const uploadedFile = request.file('file')
@@ -59,4 +59,13 @@ export default class RagController {
5959
const files = await this.ragService.getStoredFiles()
6060
return response.status(200).json({ files })
6161
}
62+
63+
/**
 * POST handler that triggers a scan of the knowledge base storage and queues
 * embedding jobs for files that are not indexed yet.
 *
 * Responds with the service result as JSON. The status is 200 when the result
 * reports `success: true` and 500 otherwise, so HTTP clients can tell a failed
 * sync from a successful one without inspecting the body.
 */
public async scanAndSync({ response }: HttpContext) {
  try {
    const syncResult = await this.ragService.scanAndSyncStorage()

    // A result with `success: false` is a failure even though nothing was
    // thrown, so it must not be answered with a 200.
    const statusCode = syncResult.success ? 200 : 500
    return response.status(statusCode).json(syncResult)
  } catch (error) {
    // Catch variables are `unknown` under strict mode; narrow before reading `.message`.
    const details = error instanceof Error ? error.message : String(error)
    return response
      .status(500)
      .json({ error: 'Error scanning and syncing storage', details })
  }
}
6271
}

admin/app/services/rag_service.ts

Lines changed: 149 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { inject } from '@adonisjs/core'
44
import logger from '@adonisjs/core/services/logger'
55
import { TokenChunker } from '@chonkiejs/core'
66
import sharp from 'sharp'
7-
import { deleteFileIfExists, determineFileType, getFile, getFileStatsIfExists, listDirectoryContentsRecursive } from '../utils/fs.js'
7+
import { deleteFileIfExists, determineFileType, getFile, getFileStatsIfExists, listDirectoryContentsRecursive, ZIM_STORAGE_PATH } from '../utils/fs.js'
88
import { PDFParse } from 'pdf-parse'
99
import { createWorker } from 'tesseract.js'
1010
import { fromBuffer } from 'pdf2pic'
@@ -399,14 +399,14 @@ export class RagService {
399399
totalArticles?: number
400400
}> {
401401
const zimExtractionService = new ZIMExtractionService()
402-
402+
403403
// Process in batches to avoid lock timeout
404404
const startOffset = batchOffset || 0
405-
405+
406406
logger.info(
407407
`[RAG] Extracting ZIM content (batch: offset=${startOffset}, size=${ZIM_BATCH_SIZE})`
408408
)
409-
409+
410410
const zimChunks = await zimExtractionService.extractZIMContent(filepath, {
411411
startOffset,
412412
batchSize: ZIM_BATCH_SIZE,
@@ -935,4 +935,149 @@ export class RagService {
935935
return { success: false, message: 'Error discovering Nomad docs.' }
936936
}
937937
}
938+
939+
/**
 * Scans the knowledge base storage directories and syncs them with Qdrant.
 *
 * Steps:
 *  1. Forces a re-discovery of the Nomad docs (best-effort; failures are only logged).
 *  2. Lists every file under the uploads directory and the ZIM directory.
 *  3. Scrolls through the content collection in Qdrant and collects the distinct
 *     `source` payload values.
 *  4. Dispatches an EmbedFileJob for each file whose path is not among those sources.
 *
 * This method does not throw: any unexpected error is logged and reported as
 * `{ success: false, message }`.
 *
 * @returns Object containing success status, message, and counts of scanned/queued files
 */
public async scanAndSyncStorage(): Promise<{
  success: boolean
  message: string
  filesScanned?: number
  filesQueued?: number
}> {
  try {
    logger.info('[RAG] Starting knowledge base sync scan')

    const kbUploadsPath = join(process.cwd(), RagService.UPLOADS_STORAGE_PATH)
    const zimPath = join(process.cwd(), ZIM_STORAGE_PATH)

    // Force resync of Nomad docs. A failure here must not abort the storage scan.
    try {
      await this.discoverNomadDocs(true)
    } catch (error) {
      logger.error({ err: error }, '[RAG] Error during Nomad docs discovery in sync process')
    }

    const filesInStorage: string[] = [
      ...(await this._listFilesForSync(kbUploadsPath, RagService.UPLOADS_STORAGE_PATH)),
      ...(await this._listFilesForSync(zimPath, ZIM_STORAGE_PATH)),
    ]

    logger.info(`[RAG] Found ${filesInStorage.length} total files in storage directories`)

    // Get all stored sources from Qdrant
    await this._ensureCollection(
      RagService.CONTENT_COLLECTION_NAME,
      RagService.EMBEDDING_DIMENSION
    )

    const qdrant = this.qdrant
    if (!qdrant) {
      // Explicit guard instead of a non-null assertion so the failure is descriptive.
      throw new Error('Qdrant client is not initialized')
    }

    const sourcesInQdrant = new Set<string>()
    let offset: string | number | null | Record<string, unknown> = null
    const batchSize = 100

    // Scroll through all points to get sources
    do {
      const scrollResult = await qdrant.scroll(RagService.CONTENT_COLLECTION_NAME, {
        limit: batchSize,
        offset: offset,
        with_payload: ['source'], // Only fetch source field for efficiency
        with_vector: false,
      })

      for (const point of scrollResult.points) {
        const source = point.payload?.source
        if (typeof source === 'string' && source) {
          sourcesInQdrant.add(source)
        }
      }

      // `??` rather than `||`: only a missing offset ends the scroll, a falsy
      // but present offset (e.g. 0) does not.
      offset = scrollResult.next_page_offset ?? null
    } while (offset !== null)

    logger.info(`[RAG] Found ${sourcesInQdrant.size} unique sources in Qdrant`)

    // Find files that are in storage but not in Qdrant
    const filesToEmbed = filesInStorage.filter((filePath) => !sourcesInQdrant.has(filePath))

    logger.info(`[RAG] Found ${filesToEmbed.length} files that need embedding`)

    if (filesToEmbed.length === 0) {
      return {
        success: true,
        message: 'Knowledge base is already in sync',
        filesScanned: filesInStorage.length,
        filesQueued: 0,
      }
    }

    // Import EmbedFileJob dynamically to avoid circular dependencies
    const { EmbedFileJob } = await import('#jobs/embed_file_job')

    // Dispatch jobs one at a time; a failure for one file is logged and does
    // not prevent the remaining files from being queued.
    let queuedCount = 0
    for (const filePath of filesToEmbed) {
      try {
        const fileName = filePath.split(/[/\\]/).pop() || filePath
        const stats = await getFileStatsIfExists(filePath)

        logger.info(`[RAG] Dispatching embed job for: ${fileName}`)
        await EmbedFileJob.dispatch({
          filePath: filePath,
          fileName: fileName,
          fileSize: stats?.size,
        })
        queuedCount++
        logger.debug(`[RAG] Successfully dispatched job for ${fileName}`)
      } catch (fileError) {
        logger.error({ err: fileError }, `[RAG] Error dispatching job for file ${filePath}`)
      }
    }

    return {
      success: true,
      message: `Scanned ${filesInStorage.length} files, queued ${queuedCount} for embedding`,
      filesScanned: filesInStorage.length,
      filesQueued: queuedCount,
    }
  } catch (error) {
    logger.error({ err: error }, '[RAG] Error scanning and syncing knowledge base')
    return {
      success: false,
      message: 'Error scanning and syncing knowledge base',
    }
  }
}

/**
 * Lists the keys of all file entries found recursively under `directory`.
 *
 * @param directory Absolute path of the directory to scan
 * @param label Name of the directory used in log messages
 * @returns The `key` of every entry whose `type` is `'file'`; an empty array
 *          when the directory does not exist (ENOENT)
 * @throws Re-throws any error other than ENOENT
 */
private async _listFilesForSync(directory: string, label: string): Promise<string[]> {
  try {
    const entries = await listDirectoryContentsRecursive(directory)
    const fileKeys = entries
      .filter((entry) => entry.type === 'file')
      .map((entry) => entry.key)
    // Log the number of files, not the number of entries (which may include directories).
    logger.debug(`[RAG] Found ${fileKeys.length} files in ${label}`)
    return fileKeys
  } catch (error) {
    const isMissingDirectory =
      typeof error === 'object' && error !== null && 'code' in error && error.code === 'ENOENT'
    if (!isMissingDirectory) {
      throw error
    }
    logger.debug(`[RAG] ${label} directory does not exist, skipping`)
    return []
  }
}
9381083
}

admin/config/logger.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ const loggerConfig = defineConfig({
1313
app: {
1414
enabled: true,
1515
name: env.get('APP_NAME'),
16-
level: env.get('LOG_LEVEL'),
16+
level: env.get('NODE_ENV') === 'production' ? env.get('LOG_LEVEL') : 'debug', // default to 'debug' in non-production envs
1717
transport: {
1818
targets:
1919
targets()

admin/inertia/components/chat/KnowledgeBaseModal.tsx

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import StyledTable from '~/components/StyledTable'
77
import { useNotifications } from '~/context/NotificationContext'
88
import api from '~/lib/api'
99
import { IconX } from '@tabler/icons-react'
10+
import { useModals } from '~/context/ModalContext'
11+
import StyledModal from '../StyledModal'
1012

1113
interface KnowledgeBaseModalProps {
1214
onClose: () => void
@@ -16,6 +18,7 @@ export default function KnowledgeBaseModal({ onClose }: KnowledgeBaseModalProps)
1618
const { addNotification } = useNotifications()
1719
const [files, setFiles] = useState<File[]>([])
1820
const fileUploaderRef = useRef<React.ComponentRef<typeof FileUploader>>(null)
21+
const { openModal, closeModal } = useModals()
1922

2023
const { data: storedFiles = [], isLoading: isLoadingFiles } = useQuery({
2124
queryKey: ['storedFiles'],
@@ -43,12 +46,53 @@ export default function KnowledgeBaseModal({ onClose }: KnowledgeBaseModalProps)
4346
},
4447
})
4548

49+
const syncMutation = useMutation({
50+
mutationFn: () => api.syncRAGStorage(),
51+
onSuccess: (data) => {
52+
addNotification({
53+
type: 'success',
54+
message: data?.message || 'Storage synced successfully. If new files were found, they have been queued for processing.',
55+
})
56+
},
57+
onError: (error: any) => {
58+
addNotification({
59+
type: 'error',
60+
message: error?.message || 'Failed to sync storage',
61+
})
62+
},
63+
})
64+
4665
const handleUpload = () => {
4766
if (files.length > 0) {
4867
uploadMutation.mutate(files[0])
4968
}
5069
}
5170

71+
const handleConfirmSync = () => {
72+
openModal(
73+
<StyledModal
74+
title='Confirm Sync?'
75+
onConfirm={() => {
76+
syncMutation.mutate()
77+
closeModal(
78+
"confirm-sync-modal"
79+
)
80+
}}
81+
onCancel={() => closeModal("confirm-sync-modal")}
82+
open={true}
83+
confirmText='Confirm Sync'
84+
cancelText='Cancel'
85+
confirmVariant='primary'
86+
>
87+
<p className='text-gray-700'>
88+
This will scan the NOMAD's storage directories for any new files and queue them for processing. This is useful if you've manually added files to the storage or want to ensure everything is up to date.
89+
This may cause a temporary increase in resource usage if new files are found and being processed. Are you sure you want to proceed?
90+
</p>
91+
</StyledModal>,
92+
"confirm-sync-modal"
93+
)
94+
}
95+
5296
return (
5397
<div className="fixed inset-0 z-50 flex items-center justify-center p-4 bg-black/30 backdrop-blur-sm transition-opacity">
5498
<div className="bg-white rounded-lg shadow-xl max-w-4xl w-full max-h-[90vh] overflow-hidden flex flex-col">
@@ -142,7 +186,19 @@ export default function KnowledgeBaseModal({ onClose }: KnowledgeBaseModalProps)
142186
</div>
143187
</div>
144188
<div className="my-12">
145-
<StyledSectionHeader title="Stored Knowledge Base Files" />
189+
<div className='flex items-center justify-between mb-6'>
190+
<StyledSectionHeader title="Stored Knowledge Base Files" className='!mb-0' />
191+
<StyledButton
192+
variant="secondary"
193+
size="md"
194+
icon='IconRefresh'
195+
onClick={handleConfirmSync}
196+
disabled={syncMutation.isPending || uploadMutation.isPending}
197+
loading={syncMutation.isPending || uploadMutation.isPending}
198+
>
199+
Sync Storage
200+
</StyledButton>
201+
</div>
146202
<StyledTable<{ source: string }>
147203
className="font-semibold"
148204
rowLines={true}

admin/inertia/lib/api.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,18 @@ class API {
448448
})()
449449
}
450450

451+
async syncRAGStorage() {
452+
return catchInternal(async () => {
453+
const response = await this.client.post<{
454+
success: boolean
455+
message: string
456+
filesScanned?: number
457+
filesQueued?: number
458+
}>('/rag/sync')
459+
return response.data
460+
})()
461+
}
462+
451463
// Wikipedia selector methods
452464

453465
async getWikipediaState(): Promise<WikipediaState | undefined> {

admin/start/routes.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ router
119119
router.post('/upload', [RagController, 'upload'])
120120
router.get('/files', [RagController, 'getStoredFiles'])
121121
router.get('/job-status', [RagController, 'getJobStatus'])
122+
router.post('/sync', [RagController, 'scanAndSync'])
122123
})
123124
.prefix('/api/rag')
124125

0 commit comments

Comments
 (0)