From 42092406ce799dc801f4bc8a2863b613d95cd992 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Sat, 21 Mar 2026 14:55:05 -0700 Subject: [PATCH 1/6] fix(kb): store filename with .txt extension for connector documents Connector documents (e.g. Fireflies transcripts) have titles without file extensions. The DB stored the raw title as filename, but the processing pipeline extracts file extension from filename to determine the parser. On retry/reprocess, this caused "Unsupported file type" errors with the document title treated as the extension. Now stores processingFilename (which includes .txt) instead of the raw title, consistent with what was actually uploaded to storage. --- apps/sim/lib/knowledge/connectors/sync-engine.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/sim/lib/knowledge/connectors/sync-engine.ts b/apps/sim/lib/knowledge/connectors/sync-engine.ts index a07005a95d9..80f6298179f 100644 --- a/apps/sim/lib/knowledge/connectors/sync-engine.ts +++ b/apps/sim/lib/knowledge/connectors/sync-engine.ts @@ -742,7 +742,7 @@ async function addDocument( await tx.insert(document).values({ id: documentId, knowledgeBaseId, - filename: extDoc.title, + filename: processingFilename, fileUrl, fileSize: contentBuffer.length, mimeType: 'text/plain', @@ -841,7 +841,7 @@ async function updateDocument( await tx .update(document) .set({ - filename: extDoc.title, + filename: processingFilename, fileUrl, fileSize: contentBuffer.length, contentHash: extDoc.contentHash, From 57437f7a3e2f0d887332ae5f855bcf0cf09cd790 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Sat, 21 Mar 2026 15:00:52 -0700 Subject: [PATCH 2/6] fix(kb): guard stuck document retry against filenames without extension Existing DB rows may have connector document filenames stored without a .txt extension (raw meeting titles). The stuck-doc retry path reads filename from DB and passes it to parseHttpFile, which extracts the extension via split('.'). 
When there's no dot, the entire title becomes the "extension", causing "Unsupported file type" errors. Falls back to 'document.txt' when the stored filename has no extension. --- apps/sim/lib/knowledge/connectors/sync-engine.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/sim/lib/knowledge/connectors/sync-engine.ts b/apps/sim/lib/knowledge/connectors/sync-engine.ts index 80f6298179f..9de6c8675b9 100644 --- a/apps/sim/lib/knowledge/connectors/sync-engine.ts +++ b/apps/sim/lib/knowledge/connectors/sync-engine.ts @@ -562,7 +562,8 @@ export async function executeSync( connector.knowledgeBaseId, doc.id, { - filename: doc.filename ?? 'document.txt', + filename: + doc.filename && doc.filename.includes('.') ? doc.filename : 'document.txt', fileUrl: doc.fileUrl ?? '', fileSize: doc.fileSize ?? 0, mimeType: 'text/plain', From d5a3ce2313bd3c9d98668a12f92498c8e4d46e4d Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Sun, 22 Mar 2026 03:10:57 -0700 Subject: [PATCH 3/6] fix(kb): fix race condition in stuck document retry during sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stuck document retry at the end of each sync was querying for all documents with processingStatus 'pending' or 'failed'. This included documents added in the CURRENT sync that were still processing asynchronously, causing duplicate concurrent processing attempts. The race between the original (correct) processing and the retry (which reads the raw title from DB as filename) produced nondeterministic failures — some documents would succeed while others would fail with "Unsupported file type: ". 
Fixes: - Filter stuck doc query by uploadedAt < syncStartedAt to exclude documents from the current sync - Pass mimeType through to parseHttpFile so text/plain content can be decoded directly without requiring a file extension in the filename (matches parseDataURI which already handles this) - Restore filename as extDoc.title in DB (the display name, not the processing filename) --- .../lib/knowledge/connectors/sync-engine.ts | 20 +++++++------ .../knowledge/documents/document-processor.ts | 28 ++++++++++++++++--- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/apps/sim/lib/knowledge/connectors/sync-engine.ts b/apps/sim/lib/knowledge/connectors/sync-engine.ts index 9de6c8675b9..efef605f52c 100644 --- a/apps/sim/lib/knowledge/connectors/sync-engine.ts +++ b/apps/sim/lib/knowledge/connectors/sync-engine.ts @@ -6,7 +6,7 @@ import { knowledgeConnectorSyncLog, } from '@sim/db/schema' import { createLogger } from '@sim/logger' -import { and, eq, inArray, isNull, ne, sql } from 'drizzle-orm' +import { and, eq, inArray, isNull, lt, ne, sql } from 'drizzle-orm' import { decryptApiKey } from '@/lib/api-key/crypto' import { getInternalApiBaseUrl } from '@/lib/core/utils/urls' import { @@ -272,11 +272,12 @@ export async function executeSync( } const syncLogId = crypto.randomUUID() + const syncStartedAt = new Date() await db.insert(knowledgeConnectorSyncLog).values({ id: syncLogId, connectorId, status: 'started', - startedAt: new Date(), + startedAt: syncStartedAt, }) let syncExitedCleanly = false @@ -536,19 +537,23 @@ export async function executeSync( throw new Error(`Knowledge base ${connector.knowledgeBaseId} was deleted during sync`) } - // Retry stuck documents that failed or never completed processing + // Retry stuck documents that failed or never completed processing. + // Only retry docs uploaded BEFORE this sync — docs added in the current sync + // are still processing asynchronously and would cause a duplicate processing race. 
const stuckDocs = await db .select({ id: document.id, fileUrl: document.fileUrl, filename: document.filename, fileSize: document.fileSize, + mimeType: document.mimeType, }) .from(document) .where( and( eq(document.connectorId, connectorId), inArray(document.processingStatus, ['pending', 'failed']), + lt(document.uploadedAt, syncStartedAt), eq(document.userExcluded, false), isNull(document.archivedAt), isNull(document.deletedAt) @@ -562,11 +567,10 @@ export async function executeSync( connector.knowledgeBaseId, doc.id, { - filename: - doc.filename && doc.filename.includes('.') ? doc.filename : 'document.txt', + filename: doc.filename ?? 'document.txt', fileUrl: doc.fileUrl ?? '', fileSize: doc.fileSize ?? 0, - mimeType: 'text/plain', + mimeType: doc.mimeType ?? 'text/plain', }, {} ).catch((error) => { @@ -743,7 +747,7 @@ async function addDocument( await tx.insert(document).values({ id: documentId, knowledgeBaseId, - filename: processingFilename, + filename: extDoc.title, fileUrl, fileSize: contentBuffer.length, mimeType: 'text/plain', @@ -842,7 +846,7 @@ async function updateDocument( await tx .update(document) .set({ - filename: processingFilename, + filename: extDoc.title, fileUrl, fileSize: contentBuffer.length, contentHash: extDoc.contentHash, diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index 0185de495b1..a68cc0a7c1b 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -727,7 +727,7 @@ async function parseWithFileParser(fileUrl: string, filename: string, mimeType: if (fileUrl.startsWith('data:')) { content = await parseDataURI(fileUrl, filename, mimeType) } else if (fileUrl.startsWith('http')) { - const result = await parseHttpFile(fileUrl, filename) + const result = await parseHttpFile(fileUrl, filename, mimeType) content = result.content metadata = result.metadata || {} } else { @@ -765,15 +765,35 @@ 
async function parseDataURI(fileUrl: string, filename: string, mimeType: string) return result.content } +const MIME_TO_EXTENSION: Record<string, string> = { + 'text/plain': 'txt', + 'text/markdown': 'md', + 'text/csv': 'csv', + 'text/html': 'html', + 'application/pdf': 'pdf', + 'application/json': 'json', + 'application/yaml': 'yaml', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx', + 'application/msword': 'doc', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx', + 'application/vnd.ms-excel': 'xls', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx', + 'application/vnd.ms-powerpoint': 'ppt', +} + async function parseHttpFile( fileUrl: string, - filename: string + filename: string, + mimeType?: string ): Promise<{ content: string; metadata?: FileParseMetadata }> { const buffer = await downloadFileWithTimeout(fileUrl) - const extension = filename.split('.').pop()?.toLowerCase() + let extension = filename.split('.').pop()?.toLowerCase() + if (!extension || extension === filename.toLowerCase()) { + extension = mimeType ? MIME_TO_EXTENSION[mimeType] : undefined + } if (!extension) { - throw new Error(`Could not determine file extension: ${filename}`) + throw new Error(`Could not determine file type for: ${filename}`) } const result = await parseBuffer(buffer, extension) From 401d801868bc67bf089e62644cb125bc1d640023 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Sun, 22 Mar 2026 03:18:59 -0700 Subject: [PATCH 4/6] fix(kb): fix race condition in stuck document retry during sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stuck document retry at the end of each sync was querying for all documents with processingStatus 'pending' or 'failed'. This included documents added in the CURRENT sync that were still processing asynchronously, causing duplicate concurrent processing attempts. 
The race between the original (correct) processing and the retry (which reads the raw title from DB as filename) produced nondeterministic failures — some documents would succeed while others would fail with "Unsupported file type: ". Fixes: - Filter stuck doc query by uploadedAt < syncStartedAt to exclude documents from the current sync - Pass mimeType through to parseHttpFile and use existing getExtensionFromMimeType utility as fallback when filename has no extension (e.g. Fireflies meeting titles) - Apply same mimeType fallback in parseDataURI for consistency --- .../knowledge/documents/document-processor.ts | 30 ++++++------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index a68cc0a7c1b..1b614f4b223 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -7,7 +7,7 @@ import { parseBuffer, parseFile } from '@/lib/file-parsers' import type { FileParseMetadata } from '@/lib/file-parsers/types' import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils' import { StorageService } from '@/lib/uploads' -import { isInternalFileUrl } from '@/lib/uploads/utils/file-utils' +import { getExtensionFromMimeType, isInternalFileUrl } from '@/lib/uploads/utils/file-utils' import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server' import { mistralParserTool } from '@/tools/mistral/parser' @@ -759,28 +759,14 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string) : decodeURIComponent(base64Data) } - const extension = filename.split('.').pop()?.toLowerCase() || 'txt' + const extension = filename.includes('.') + ? filename.split('.').pop()!.toLowerCase() + : getExtensionFromMimeType(mimeType) ?? 
'txt' const buffer = Buffer.from(base64Data, 'base64') const result = await parseBuffer(buffer, extension) return result.content } -const MIME_TO_EXTENSION: Record<string, string> = { - 'text/plain': 'txt', - 'text/markdown': 'md', - 'text/csv': 'csv', - 'text/html': 'html', - 'application/pdf': 'pdf', - 'application/json': 'json', - 'application/yaml': 'yaml', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx', - 'application/msword': 'doc', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx', - 'application/vnd.ms-excel': 'xls', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx', - 'application/vnd.ms-powerpoint': 'ppt', -} - async function parseHttpFile( fileUrl: string, filename: string, @@ -788,9 +774,11 @@ async function parseHttpFile( ): Promise<{ content: string; metadata?: FileParseMetadata }> { const buffer = await downloadFileWithTimeout(fileUrl) - let extension = filename.split('.').pop()?.toLowerCase() - if (!extension || extension === filename.toLowerCase()) { - extension = mimeType ? MIME_TO_EXTENSION[mimeType] : undefined + let extension = filename.includes('.') + ? filename.split('.').pop()?.toLowerCase() + : undefined + if (!extension && mimeType) { + extension = getExtensionFromMimeType(mimeType) ?? 
undefined } if (!extension) { throw new Error(`Could not determine file type for: ${filename}`) From 7c43faaacbcc2f5408fc07aafb88ee91d78e7be6 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Sun, 22 Mar 2026 03:22:09 -0700 Subject: [PATCH 5/6] lint --- apps/sim/hooks/queries/kb/connectors.ts | 19 +++++++++++++++++-- .../knowledge/documents/document-processor.ts | 6 ++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/apps/sim/hooks/queries/kb/connectors.ts b/apps/sim/hooks/queries/kb/connectors.ts index fd86f306e0a..1b737d8f2a5 100644 --- a/apps/sim/hooks/queries/kb/connectors.ts +++ b/apps/sim/hooks/queries/kb/connectors.ts @@ -88,6 +88,21 @@ async function fetchConnectorDetail( return result.data } +/** Stop polling for initial sync after 2 minutes */ +const PENDING_SYNC_WINDOW_MS = 2 * 60 * 1000 + +/** + * Checks if a connector is syncing or awaiting its first sync within the allowed window + */ +export function isConnectorSyncingOrPending(connector: ConnectorData): boolean { + if (connector.status === 'syncing') return true + return ( + connector.status === 'active' && + !connector.lastSyncAt && + Date.now() - new Date(connector.createdAt).getTime() < PENDING_SYNC_WINDOW_MS + ) +} + export function useConnectorList(knowledgeBaseId?: string) { return useQuery({ queryKey: connectorKeys.list(knowledgeBaseId), @@ -97,8 +112,8 @@ export function useConnectorList(knowledgeBaseId?: string) { placeholderData: keepPreviousData, refetchInterval: (query) => { const connectors = query.state.data - const hasSyncing = connectors?.some((c) => c.status === 'syncing') - return hasSyncing ? 3000 : false + if (!connectors?.length) return false + return connectors.some(isConnectorSyncingOrPending) ? 
3000 : false }, }) } diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index 1b614f4b223..45d64caf496 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -761,7 +761,7 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string) const extension = filename.includes('.') ? filename.split('.').pop()!.toLowerCase() - : getExtensionFromMimeType(mimeType) ?? 'txt' + : (getExtensionFromMimeType(mimeType) ?? 'txt') const buffer = Buffer.from(base64Data, 'base64') const result = await parseBuffer(buffer, extension) return result.content @@ -774,9 +774,7 @@ async function parseHttpFile( ): Promise<{ content: string; metadata?: FileParseMetadata }> { const buffer = await downloadFileWithTimeout(fileUrl) - let extension = filename.includes('.') - ? filename.split('.').pop()?.toLowerCase() - : undefined + let extension = filename.includes('.') ? filename.split('.').pop()?.toLowerCase() : undefined if (!extension && mimeType) { extension = getExtensionFromMimeType(mimeType) ?? undefined } From 7da1dd024c658bb55ef6ced88521f4698e29af77 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Sun, 22 Mar 2026 03:33:35 -0700 Subject: [PATCH 6/6] fix(kb): handle empty extension edge case in parseDataURI When filename ends with a dot (e.g. "file."), split('.').pop() returns an empty string. Fall through to mimeType-based extension lookup instead of passing empty string to parseBuffer. 
Co-Authored-By: Claude Opus 4.6 --- apps/sim/lib/knowledge/documents/document-processor.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index 45d64caf496..72bf9007c9d 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -759,9 +759,10 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string) : decodeURIComponent(base64Data) } - const extension = filename.includes('.') - ? filename.split('.').pop()!.toLowerCase() - : (getExtensionFromMimeType(mimeType) ?? 'txt') + const extension = + (filename.includes('.') ? filename.split('.').pop()?.toLowerCase() : undefined) || + getExtensionFromMimeType(mimeType) || + 'txt' const buffer = Buffer.from(base64Data, 'base64') const result = await parseBuffer(buffer, extension) return result.content