simstudioai · waleedlatif1 · Mar 27, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/.agents/skills/add-connector/SKILL.md b/.agents/skills/add-connector/SKILL.md
@@ -71,12 +71,14 @@ export const {service}Connector: ConnectorConfig = {
   ],
 
   listDocuments: async (accessToken, sourceConfig, cursor) => {
-    // Paginate via cursor, extract text, compute SHA-256 hash
+    // Return metadata stubs with contentDeferred: true (if per-doc content fetch needed)
+    // Or full documents with content (if list API returns content inline)
     // Return { documents: ExternalDocument[], nextCursor?, hasMore }
   },
 
   getDocument: async (accessToken, sourceConfig, externalId) => {
-    // Return ExternalDocument or null
+    // Fetch full content for a single document
+    // Return ExternalDocument with contentDeferred: false, or null
   },
 
   validateConfig: async (accessToken, sourceConfig) => {
@@ -281,26 +283,110 @@ Every document returned from `listDocuments`/`getDocument` must include:
 {
   externalId: string          // Source-specific unique ID
   title: string               // Document title
-  content: string             // Extracted plain text
+  content: string             // Extracted plain text (or '' if contentDeferred)
+  contentDeferred?: boolean   // true = content will be fetched via getDocument
   mimeType: 'text/plain'     // Always text/plain (content is extracted)
-  contentHash: string         // SHA-256 of content (change detection)
+  contentHash: string         // Change-detection hash (metadata-based when contentDeferred)
   sourceUrl?: string          // Link back to original (stored on document record)
   metadata?: Record<string, unknown>  // Source-specific data (fed to mapTags)
 }
 ```
 
-## Content Hashing (Required)
+## Content Deferral (Required for file/content-download connectors)
 
-The sync engine uses content hashes for change detection:
+**All connectors that require per-document API calls to fetch content MUST use `contentDeferred: true`.** This is the standard pattern — `listDocuments` returns lightweight metadata stubs, and content is fetched lazily by the sync engine via `getDocument` only for new/changed documents.
+
+This pattern is critical for reliability: the sync engine processes documents in batches and enqueues each batch for processing immediately. If a sync times out, all previously-batched documents are already queued. Without deferral, content downloads during listing can exhaust the sync task's time budget before any documents are saved.
+
+### When to use `contentDeferred: true`
+
+- The service's list API does NOT return document content (only metadata)
+- Content requires a separate download/export API call per document
+- Examples: Google Drive, OneDrive, SharePoint, Dropbox, Notion, Confluence, Gmail, Obsidian, Evernote, GitHub
+
+### When NOT to use `contentDeferred`
+
+- The list API already returns the full content inline (e.g., Slack messages, Reddit posts, HubSpot notes)
+- No per-document API call is needed to get content
+
+### Content Hash Strategy
+
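+Use a **metadata-based** `contentHash` for deferred documents — never a hash computed from downloaded content. The hash must be derivable from the list response metadata alone, so the sync engine can detect changes without downloading content. (Connectors whose list API already returns content inline may hash that content instead.)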
+
+Good metadata hash sources:
+- `modifiedTime` / `lastModifiedDateTime` — changes when file is edited
+- Git blob SHA — unique per content version
+- API-provided content hash (e.g., Dropbox `content_hash`)
+- Version number (e.g., Confluence page version)
+
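+Format: `{service}:{id}:{changeIndicator}` (the `{id}` segment may be omitted when the change indicator is itself content-addressable, as with a git blob SHA)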
 
 ```typescript
-async function computeContentHash(content: string): Promise<string> {
-  const data = new TextEncoder().encode(content)
-  const hashBuffer = await crypto.subtle.digest('SHA-256', data)
-  return Array.from(new Uint8Array(hashBuffer)).map(b => b.toString(16).padStart(2, '0')).join('')
+// Google Drive: modifiedTime changes on edit
+contentHash: `gdrive:${file.id}:${file.modifiedTime ?? ''}`
+
+// GitHub: blob SHA is a content-addressable hash
+contentHash: `gitsha:${item.sha}`
+
+// Dropbox: API provides content_hash
+contentHash: `dropbox:${entry.id}:${entry.content_hash ?? entry.server_modified}`
+
+// Confluence: version number increments on edit
+contentHash: `confluence:${page.id}:${page.version.number}`
+```
+
+**Critical invariant:** The `contentHash` MUST be identical whether produced by `listDocuments` (stub) or `getDocument` (full doc). Both should use the same stub function to guarantee this.
+
+### Implementation Pattern
+
+```typescript
+// 1. Create a stub function (sync, no API calls)
+function fileToStub(file: ServiceFile): ExternalDocument {
+  return {
+    externalId: file.id,
+    title: file.name || 'Untitled',
+    content: '',
+    contentDeferred: true,
+    mimeType: 'text/plain',
+    sourceUrl: `https://service.com/file/${file.id}`,
+    contentHash: `service:${file.id}:${file.modifiedTime ?? ''}`,
+    metadata: { /* fields needed by mapTags */ },
+  }
+}
+
+// 2. listDocuments returns stubs (fast, metadata only)
+listDocuments: async (accessToken, sourceConfig, cursor) => {
+  const response = await fetchWithRetry(listUrl, { ... })
+  const files = (await response.json()).files
+  const documents = files.map(fileToStub)
+  return { documents, nextCursor, hasMore }
+}
+
+// 3. getDocument fetches content and returns full doc with SAME contentHash
+getDocument: async (accessToken, sourceConfig, externalId) => {
+  const metadata = await fetchWithRetry(metadataUrl, { ... })
+  const file = await metadata.json()
+  if (file.trashed) return null
+
+  try {
+    const content = await fetchContent(accessToken, file)
+    if (!content.trim()) return null
+    const stub = fileToStub(file)
+    return { ...stub, content, contentDeferred: false }
+  } catch (error) {
+    logger.warn(`Failed to fetch content for: ${file.name}`, { error })
+    return null
+  }
 }
 ```
 
+### Reference Implementations
+
+- **Google Drive**: `connectors/google-drive/google-drive.ts` — file download/export with `modifiedTime` hash
+- **GitHub**: `connectors/github/github.ts` — git blob SHA hash
+- **Notion**: `connectors/notion/notion.ts` — blocks API with `last_edited_time` hash
+- **Confluence**: `connectors/confluence/confluence.ts` — version number hash
+
 ## tagDefinitions — Declared Tag Definitions
 
 Declare which tags the connector populates using semantic IDs. Shown in the add-connector modal as opt-out checkboxes.
@@ -409,7 +495,10 @@ export const CONNECTOR_REGISTRY: ConnectorRegistry = {
 
 ## Reference Implementations
 
-- **OAuth**: `apps/sim/connectors/confluence/confluence.ts` — multiple config field types, `mapTags`, label fetching
+- **OAuth + contentDeferred**: `apps/sim/connectors/google-drive/google-drive.ts` — file download with metadata-based hash, `orderBy` for deterministic pagination
+- **OAuth + contentDeferred (blocks API)**: `apps/sim/connectors/notion/notion.ts` — complex block content extraction deferred to `getDocument`
+- **OAuth + contentDeferred (git)**: `apps/sim/connectors/github/github.ts` — blob SHA hash, tree listing
+- **OAuth + contentDeferred (version hash)**: `apps/sim/connectors/confluence/confluence.ts` — version number hash, multiple config field types, `mapTags`, label fetching
 - **API key**: `apps/sim/connectors/fireflies/fireflies.ts` — GraphQL API with Bearer token auth
 
 ## Checklist
@@ -425,7 +514,9 @@ export const CONNECTOR_REGISTRY: ConnectorRegistry = {
   - `selectorKey` exists in `hooks/selectors/registry.ts`
   - `dependsOn` references selector field IDs (not `canonicalParamId`)
   - Dependency `canonicalParamId` values exist in `SELECTOR_CONTEXT_FIELDS`
-- [ ] `listDocuments` handles pagination and computes content hashes
+- [ ] `listDocuments` handles pagination with metadata-based content hashes
+- [ ] `contentDeferred: true` used if content requires per-doc API calls (file download, export, blocks fetch)
+- [ ] For deferred documents, `contentHash` is derived from list metadata (not from downloaded content) and is identical between the stub and `getDocument`
 - [ ] `sourceUrl` set on each ExternalDocument (full URL, not relative)
 - [ ] `metadata` includes source-specific data for tag mapping
 - [ ] `tagDefinitions` declared for each semantic key returned by `mapTags`

diff --git a/.agents/skills/validate-connector/SKILL.md b/.agents/skills/validate-connector/SKILL.md
@@ -141,12 +141,24 @@ For each API endpoint the connector calls:
 
 ## Step 6: Validate Data Transformation
 
+### Content Deferral (CRITICAL)
+Connectors that require per-document API calls to fetch content (file download, export, blocks fetch) MUST use `contentDeferred: true`. This is the standard pattern for reliability — without it, content downloads during listing can exhaust the sync task's time budget before any documents are saved.
+
+- [ ] `listDocuments` does NOT download content per document; a connector that needs a per-doc content fetch MUST use `contentDeferred: true` instead
+- [ ] `listDocuments` returns lightweight stubs with `content: ''` and `contentDeferred: true`
+- [ ] `getDocument` fetches actual content and returns the full document with `contentDeferred: false`
+- [ ] A shared stub function (e.g., `fileToStub`) is used by both `listDocuments` and `getDocument` to guarantee `contentHash` consistency
+- [ ] `contentHash` is metadata-based (e.g., `service:{id}:{modifiedTime}`), NOT content-based — it must be derivable from list metadata alone
+- [ ] The `contentHash` is identical whether produced by `listDocuments` or `getDocument`
+
+Connectors where the list API already returns content inline (e.g., Slack messages, Reddit posts) do NOT need `contentDeferred`.
+
 ### ExternalDocument Construction
 - [ ] `externalId` is a stable, unique identifier from the source API
 - [ ] `title` is extracted from the correct field and has a sensible fallback (e.g., `'Untitled'`)
 - [ ] `content` is plain text — HTML content is stripped using `htmlToPlainText` from `@/connectors/utils`
 - [ ] `mimeType` is `'text/plain'`
-- [ ] `contentHash` is computed using `computeContentHash` from `@/connectors/utils`
+- [ ] `contentHash` uses a metadata-based format (e.g., `service:{id}:{modifiedTime}`) for connectors with `contentDeferred: true`, or `computeContentHash` from `@/connectors/utils` for inline-content connectors
 - [ ] `sourceUrl` is a valid, complete URL back to the original resource (not relative)
 - [ ] `metadata` contains all fields referenced by `mapTags` and `tagDefinitions`
 
@@ -200,6 +212,8 @@ For each API endpoint the connector calls:
 - [ ] Fetches a single document by `externalId`
 - [ ] Returns `null` for 404 / not found (does not throw)
 - [ ] Returns the same `ExternalDocument` shape as `listDocuments`
+- [ ] If `listDocuments` uses `contentDeferred: true`, `getDocument` MUST fetch actual content and return `contentDeferred: false`
+- [ ] If `listDocuments` uses `contentDeferred: true`, `getDocument` MUST use the same stub function to ensure `contentHash` is identical
 - [ ] Handles all content types that `listDocuments` can produce (e.g., if `listDocuments` returns both pages and blogposts, `getDocument` must handle both — not hardcode one endpoint)
 - [ ] Forwards `syncContext` if it needs cached state (user names, field maps, etc.)
 - [ ] Error handling is graceful (catches, logs, returns null or throws with context)
@@ -253,6 +267,8 @@ Group findings by severity:
 - Missing error handling that would crash the sync
 - `requiredScopes` not a subset of OAuth provider scopes
 - Query/filter injection: user-controlled values interpolated into OData `$filter`, SOQL, or query strings without escaping
+- Per-document content download in `listDocuments` without `contentDeferred: true` — causes sync timeouts for large document sets
+- `contentHash` mismatch between `listDocuments` stub and `getDocument` return — causes unnecessary re-processing every sync
 
 **Warning** (incorrect behavior, data quality issues, or convention violations):
 - HTML content not stripped via `htmlToPlainText`
@@ -300,6 +316,7 @@ After fixing, confirm:
 - [ ] Validated scopes are sufficient for all API endpoints the connector calls
 - [ ] Validated token refresh config (`useBasicAuth`, `supportsRefreshTokenRotation`)
 - [ ] Validated pagination: cursor names, page sizes, hasMore logic, no silent caps
+- [ ] Validated content deferral: `contentDeferred: true` used when per-doc content fetch required, metadata-based `contentHash` consistent between stub and `getDocument`
 - [ ] Validated data transformation: plain text extraction, HTML stripping, content hashing
 - [ ] Validated tag definitions match mapTags output, correct fieldTypes
 - [ ] Validated config fields: canonical pairs, selector keys, required flags