{"flow":{"id":70,"summary":"When new expenses are uploaded to Google Drive, extract text using Tesseract and notify on Slack","versions":[275,276],"created_by":"henri186","created_at":"2025-08-01T12:19:52.792Z","votes":0,"approved":false,"apps":["gdrive","slack","tesseract"],"value":{"modules":[{"id":"a","value":{"type":"rawscript","assets":[],"content":"import * as wmill from \"windmill-client\"\n\nexport async function main(\n  gdrive_resource: RT.Gdrive,\n  folder_id?: string,\n  file_extensions: string[] = [\"jpg\", \"jpeg\", \"png\", \"pdf\"]\n) {\n  const token = gdrive_resource.token;\n  \n  // Build query to search for files\n  let query = `trashed=false`;\n  \n  // If folder_id is provided, search within that folder\n  if (folder_id && folder_id.trim() !== '') {\n    query += ` and '${folder_id}' in parents`;\n  }\n  \n  // Add file extension filter\n  if (file_extensions.length > 0) {\n    const extensionQuery = file_extensions.map(ext => `name contains '.${ext.toLowerCase()}'`).join(' or ');\n    query += ` and (${extensionQuery})`;\n  }\n  \n  // Search for files modified in the last 24 hours\n  const yesterday = new Date();\n  yesterday.setDate(yesterday.getDate() - 1);\n  const isoDate = yesterday.toISOString();\n  query += ` and modifiedTime > '${isoDate}'`;\n  \n  const response = await fetch(`https://www.googleapis.com/drive/v3/files?q=${encodeURIComponent(query)}&orderBy=modifiedTime desc&fields=files(id,name,mimeType,modifiedTime,webViewLink)`, {\n    headers: {\n      'Authorization': `Bearer ${token}`,\n      'Content-Type': 'application/json'\n    }\n  });\n  \n  if (!response.ok) {\n    throw new Error(`Failed to list files: ${response.statusText}`);\n  }\n  \n  const data = await response.json();\n  \n  console.log(`Found ${data.files?.length || 0} recent expense files`);\n  \n  return {\n    files: data.files || [],\n    total_count: data.files?.length || 0\n  
};\n}","language":"bun","input_transforms":{"folder_id":{"expr":"flow_input.folder_id","type":"javascript"},"file_extensions":{"expr":"flow_input.file_extensions","type":"javascript"},"gdrive_resource":{"expr":"flow_input.gdrive_resource","type":"javascript"}}},"summary":"List recent files"},{"id":"b","value":{"type":"forloopflow","modules":[{"id":"c","value":{"type":"rawscript","assets":[],"content":"import * as wmill from \"windmill-client\"\n\nexport async function main(\n  gdrive_resource: RT.Gdrive,\n  file: { id: string, name: string, mimeType: string, modifiedTime: string, webViewLink: string }\n) {\n  const token = gdrive_resource.token;\n  \n  console.log(`Downloading file: ${file.name} (${file.id})`);\n  \n  // Download the file content\n  const response = await fetch(`https://www.googleapis.com/drive/v3/files/${file.id}?alt=media`, {\n    headers: {\n      'Authorization': `Bearer ${token}`\n    }\n  });\n  \n  if (!response.ok) {\n    throw new Error(`Failed to download file ${file.name}: ${response.statusText}`);\n  }\n  \n  // Get file content as buffer\n  const fileBuffer = await response.arrayBuffer();\n  const fileContent = new Uint8Array(fileBuffer);\n  \n  console.log(`Downloaded ${file.name}, size: ${fileContent.length} bytes`);\n  \n  return {\n    file_info: file,\n    file_content: fileContent,\n    file_size: fileContent.length\n  };\n}","language":"bun","input_transforms":{"file":{"expr":"flow_input.iter.value","type":"javascript"},"gdrive_resource":{"expr":"flow_input.gdrive_resource","type":"javascript"}}},"summary":"Download file from Drive"},{"id":"d","value":{"type":"rawscript","assets":[],"content":"import pytesseract\nfrom PIL import Image\nimport io\nimport fitz  # PyMuPDF for PDF handling\nimport tempfile\nimport os\n\ndef main(\n    file_data: dict,  # Contains file_info, file_content, file_size from previous step\n):\n    \"\"\"\n    Extract text from image or PDF files using Tesseract OCR\n    \"\"\"\n    file_info = 
file_data[\"file_info\"]\n    file_content = bytes(file_data[\"file_content\"])\n    file_name = file_info[\"name\"]\n    mime_type = file_info[\"mimeType\"]\n    \n    print(f\"Processing file: {file_name} ({mime_type})\")\n    \n    extracted_text = \"\"\n    \n    try:\n        if mime_type == \"application/pdf\":\n            # Handle PDF files\n            with tempfile.NamedTemporaryFile(suffix=\".pdf\", delete=False) as temp_pdf:\n                temp_pdf.write(file_content)\n                temp_pdf_path = temp_pdf.name\n            \n            try:\n                # Open PDF and extract text from each page\n                pdf_document = fitz.open(temp_pdf_path)\n                \n                for page_num in range(pdf_document.page_count):\n                    page = pdf_document[page_num]\n                    \n                    # Convert page to image\n                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for better OCR\n                    img_data = pix.tobytes(\"png\")\n                    \n                    # Use Tesseract on the image\n                    image = Image.open(io.BytesIO(img_data))\n                    page_text = pytesseract.image_to_string(image, config='--psm 6')\n                    \n                    if page_text.strip():\n                        extracted_text += f\"--- Page {page_num + 1} ---\\n{page_text}\\n\\n\"\n                \n                pdf_document.close()\n                \n            finally:\n                # Clean up temporary file\n                os.unlink(temp_pdf_path)\n                \n        else:\n            # Handle image files (JPEG, PNG, etc.)\n            image = Image.open(io.BytesIO(file_content))\n            \n            # Convert to RGB if necessary (for PNG with transparency, etc.)\n            if image.mode != 'RGB':\n                image = image.convert('RGB')\n            \n            # Use Tesseract to extract text\n            extracted_text = 
pytesseract.image_to_string(image, config='--psm 6')\n    \n    except Exception as e:\n        print(f\"Error processing {file_name}: {str(e)}\")\n        extracted_text = f\"Error extracting text: {str(e)}\"\n    \n    # Clean up the extracted text\n    cleaned_text = extracted_text.strip()\n    \n    print(f\"Extracted {len(cleaned_text)} characters from {file_name}\")\n    \n    return {\n        \"file_info\": file_info,\n        \"extracted_text\": cleaned_text,\n        \"text_length\": len(cleaned_text),\n        \"processing_status\": \"success\" if cleaned_text and not cleaned_text.startswith(\"Error\") else \"failed\"\n    }","language":"python3","input_transforms":{"file_data":{"expr":"results.c","type":"javascript"}}},"summary":"Extract text with Tesseract"},{"id":"e","value":{"type":"rawscript","assets":[],"content":"import * as wmill from \"windmill-client\"\n\nexport async function main(\n  slack_resource: RT.Slack,\n  channel: string,\n  ocr_result: {\n    file_info: { id: string, name: string, mimeType: string, modifiedTime: string, webViewLink: string },\n    extracted_text: string,\n    text_length: number,\n    processing_status: string\n  }\n) {\n  const { file_info, extracted_text, text_length, processing_status } = ocr_result;\n  \n  // Prepare the message\n  let message = `📄 *New Expense Document Processed*\\n\\n`;\n  message += `*File:* ${file_info.name}\\n`;\n  message += `*Modified:* ${new Date(file_info.modifiedTime).toLocaleString()}\\n`;\n  message += `*Status:* ${processing_status === 'success' ? '✅ Success' : '❌ Failed'}\\n`;\n  message += `*Text Length:* ${text_length} characters\\n`;\n  message += `*View File:* <${file_info.webViewLink}|Open in Google Drive>\\n\\n`;\n  \n  if (processing_status === 'success' && extracted_text) {\n    // Truncate text if too long for Slack\n    const maxTextLength = 1500;\n    const displayText = extracted_text.length > maxTextLength \n      ? 
extracted_text.substring(0, maxTextLength) + \"...\\n\\n_[Text truncated - full text available in file]_\"\n      : extracted_text;\n    \n    message += `*Extracted Text:*\\n\\`\\`\\`\\n${displayText}\\n\\`\\`\\``;\n  } else {\n    message += `*Error:* ${extracted_text || 'Failed to extract text'}`;\n  }\n  \n  // Send message to Slack\n  const response = await fetch('https://slack.com/api/chat.postMessage', {\n    method: 'POST',\n    headers: {\n      'Authorization': `Bearer ${slack_resource.token}`,\n      'Content-Type': 'application/json'\n    },\n    body: JSON.stringify({\n      channel: channel,\n      text: message,\n      unfurl_links: false,\n      unfurl_media: false\n    })\n  });\n  \n  const result = await response.json();\n  \n  if (!result.ok) {\n    throw new Error(`Failed to send Slack message: ${result.error}`);\n  }\n  \n  console.log(`Sent Slack notification for ${file_info.name} to ${channel}`);\n  \n  return {\n    message_sent: true,\n    channel: channel,\n    file_name: file_info.name,\n    timestamp: result.ts,\n    processing_status: processing_status\n  };\n}","language":"bun","input_transforms":{"channel":{"expr":"flow_input.slack_channel","type":"javascript","value":""},"ocr_result":{"expr":"results.d","type":"javascript"},"slack_resource":{"expr":"flow_input.slack_resource","type":"javascript"}}},"summary":"Send Slack notification"}],"iterator":{"expr":"results.a.files","type":"javascript"},"parallel":false,"skip_failures":true}}]},"schema":{"type":"object","order":["gdrive_resource","folder_id","slack_resource","slack_channel","file_extensions"],"$schema":"https://json-schema.org/draft/2020-12/schema","required":["gdrive_resource","slack_resource"],"properties":{"folder_id":{"type":"string","default":"","description":"Google Drive folder ID to monitor for new expense files"},"slack_channel":{"type":"string","default":"#expenses","description":"Slack channel to send notifications 
to"},"slack_resource":{"type":"object","format":"resource-slack","description":"Slack resource for sending notifications"},"file_extensions":{"type":"array","items":{"type":"string"},"default":["jpg","jpeg","png","pdf"],"description":"File extensions to process (e.g., jpg, png, pdf)"},"gdrive_resource":{"type":"object","format":"resource-gdrive","description":"Google Drive resource for authentication"}}},"description":"","recording":null,"vcreated_at":"2025-08-01T12:20:34.482Z","vcreated_by":"henri186","comments":[]}}