{"flow":{"id":45,"summary":"Extract and Embed Documentation for Semantic Search","versions":[175,176,177,190,191,193,194,195],"created_by":"faton ramadani2","created_at":"2023-06-07T13:20:36.529Z","votes":0,"approved":false,"apps":["discord","openai","github","supabase"],"value":{"modules":[{"id":"a","value":{"lock":"","path":"","type":"rawscript","content":"import * as wmill from \"https://deno.land/x/windmill@v1.85.0/mod.ts\";\nimport { Octokit } from \"https://cdn.skypack.dev/@octokit/rest\";\n\ntype FileContent = { content: string; link: string };\n\nexport async function main(\n  gh_auth: wmill.Resource<\"github\">,\n  owner: string,\n  repo: string,\n  path?: string,\n  ref?: string,\n  result_format: \"github_object\" | \"json\" = \"github_object\",\n): Promise<FileContent[]> {\n  const octokit = new Octokit({ auth: gh_auth.token });\n\n  const response = await octokit.request(\n    `GET /repos/{owner}/{repo}/contents/${path}${ref ? \"?ref=\" + ref : \"\"}`,\n    {\n      owner,\n      repo,\n      headers: {\n        \"X-GitHub-Api-Version\": \"2022-11-28\",\n        Accept: `application/${\n          result_format === \"json\" ? 
\"vnd.github+json\" : \"vnd.github.object\"\n        }`,\n      },\n    },\n  );\n\n  const entries = response.data.entries;\n\n  const fileContents: FileContent[] = [];\n\n  for (const entry of entries) {\n    if (entry.type === \"file\") {\n      const isMarkdown = entry.name.endsWith(\".md\");\n      const isMDX = entry.name.endsWith(\".mdx\");\n\n      if (isMarkdown || isMDX) {\n        const link = getDocusaurusPathFromGithub(entry.path);\n\n        const contentResponse = await octokit.request(\n          \"GET /repos/{owner}/{repo}/contents/{path}\",\n          {\n            owner,\n            repo,\n            path: entry.path,\n            headers: {\n              \"X-GitHub-Api-Version\": \"2022-11-28\",\n              Accept: \"application/vnd.github.v3.raw\", // Request raw content of the file\n            },\n          },\n        );\n\n        const content = contentResponse.data as string;\n\n        fileContents.push({\n          content,\n          link,\n        });\n      }\n    } else if (entry.type === \"dir\") {\n      // Recursively process directories\n      const dirContents = await main(\n        gh_auth,\n        owner,\n        repo,\n        entry.path,\n        ref,\n        result_format,\n      );\n      fileContents.push(...dirContents);\n    }\n  }\n\n  return fileContents;\n}\n\nfunction getDocusaurusPathFromGithub(githubUrl: string): string {\n  const match = githubUrl.match(/docs\\/(.+\\.(md|mdx))/);\n  if (match) {\n    let filePath = match[1];\n    filePath = filePath.replace(/\\.(md|mdx)$/, \"\");\n\n    // Split the path into segments\n    let pathSegments = filePath.split(\"/\");\n\n    // Remove numbers and underscores from the beginning of each segment\n    pathSegments = pathSegments.map((segment) =>\n      segment.replace(/^[0-9]*_/, \"\")\n    );\n\n    // Reconstruct the path\n    filePath = pathSegments.join(\"/\");\n\n    return \"https://docs.windmill.dev/docs/\" + filePath;\n  }\n  return 
githubUrl;\n}\n","language":"deno","input_transforms":{"ref":{"type":"static","value":""},"path":{"type":"static","value":""},"repo":{"type":"static","value":"windmilldocs"},"owner":{"type":"static","value":"windmill-labs"},"gh_auth":{"type":"static","value":""},"result_format":{"type":"static","value":"github_object"}}},"summary":"Extract Markdown from Github"},{"id":"b","value":{"type":"forloopflow","modules":[{"id":"c","value":{"path":"hub/858/openai/create_embedding","type":"rawscript","content":"import type { Resource } from \"https://deno.land/x/windmill@v1.85.0/mod.ts\";\nimport { Configuration, OpenAIApi } from \"npm:openai@3.1.0\";\n\nexport async function main(\n  auth: Resource<\"openai\">,\n  prompt: string,\n  model: string = \"text-embedding-ada-002\",\n) {\n  try {\n    const configuration = new Configuration({\n      apiKey: auth.api_key,\n      organization: auth.organization_id,\n    });\n    const openai = new OpenAIApi(configuration);\n\n    const response = await openai.createEmbedding({\n      model,\n      input: prompt,\n    });\n\n    const [{ embedding }] = response.data.data;\n    return embedding;\n  } catch (error) {\n    // Handle the error\n    console.error(\"An error occurred:\", error);\n    // Return an appropriate response or throw a new error\n    return [];\n  }\n}\n","language":"deno","input_transforms":{"auth":{"type":"static","value":""},"model":{"type":"static","value":"text-embedding-ada-002"},"prompt":{"expr":"`${flow_input.iter.value.content}`","type":"javascript"}}},"summary":"Create Embedding"},{"id":"d","value":{"path":"hub/944/supabase/insert_data","type":"rawscript","content":"import { Resource } from \"https://deno.land/x/windmill@v1.85.0/mod.ts\";\nimport { refreshAndRetryIfExpired } from \"https://deno.land/x/windmill_helpers@v1.1.1/mod.ts\";\n\nexport async function main(\n  auth: Resource<\"supabase\">,\n  embedding: any,\n  document: string,\n  link: string,\n  token?: {\n    access: string;\n    refresh: 
string;\n    expires_at?: number;\n  },\n) {\n  return await refreshAndRetryIfExpired(auth, token, async (client) => {\n    const query: any = await client.from(\"documents\").insert({\n      content: document,\n      embedding,\n      link,\n    });\n\n    return query;\n  });\n}\n","language":"deno","input_transforms":{"auth":{"type":"static","value":""},"link":{"expr":"`${flow_input.iter.value.link}`","type":"javascript"},"token":{"type":"static","value":{"access":"","refresh":""}},"document":{"expr":"`${flow_input.iter.value.content}`","type":"javascript"},"embedding":{"expr":"results.c","type":"javascript"}}},"summary":"Insert data on Supabase"}],"iterator":{"expr":"results.a","type":"javascript"},"parallel":false,"skip_failures":true}}]},"schema":{"type":"object","$schema":"https://json-schema.org/draft/2020-12/schema","required":[],"properties":{}},"description":"This workflow automates the process of extracting documentation from a GitHub repository, creating text embeddings, and saving these along with the original text to a database.\n\nThe workflow is broken down into two main modules:\n\n1. The first module uses the GitHub API to retrieve documentation files (in .md or .mdx format) from the \"windmilldocs\" repository owned by \"windmill-labs\". It also converts each file's repository path into the link of the corresponding Docusaurus page on docs.windmill.dev.\n\n2. The second module operates in a loop for each of the extracted documentation files. It utilizes OpenAI's text-embedding model to create embeddings of the text and then stores these embeddings, along with the original text content and its link, into a Supabase database.\n\nThis is useful for building semantic search functionality in a web application or software documentation site. The text embeddings can be used to find the most relevant documents based on semantic similarity to a query.","recording":null,"vcreated_at":"2023-06-28T10:52:13.892Z","vcreated_by":"faton ramadani2","comments":[]}}