Edit history of script submission #22233 for 'Scrape Single URL (apify)'

  • bun
    One script reply has been approved by the moderators
    Approved
    import { ApifyClient } from 'apify-client@^2.19.0';
    
    /**
     * Credential object carrying an Apify API key.
     * `createClient` uses `api_key` as the client token when no OAuth token is supplied.
     * NOTE(review): presumably mirrors a Windmill resource type of the same name — confirm.
     */
    type ApifyApiKey = {
      api_key: string;
    };
    
    /**
     * Credential object carrying an Apify OAuth token.
     * `createClient` prefers `token` over the API key when both are supplied.
     * NOTE(review): presumably mirrors a Windmill OAuth resource type — confirm.
     */
    type Apify = {
      token: string;
    };
    
    /**
     * Builds an Apify client from whichever credential is usable.
     *
     * The OAuth token takes precedence; the API key is the fallback. Every request
     * sent by the returned client is tagged with the
     * `x-apify-integration-platform: windmill` header.
     *
     * @param api_key - Optional API-key credential.
     * @param oauth_token - Optional OAuth credential.
     * @returns A client configured with the selected token.
     * @throws Error when neither credential provides a non-empty token.
     */
    const createClient = (api_key?: ApifyApiKey, oauth_token?: Apify): ApifyClient => {
      // `||` rather than `??`: an empty-string OAuth token is as unusable as a
      // missing one, so it must not mask a valid API key.
      const token = oauth_token?.token || api_key?.api_key;
      if (!token) {
        throw new Error('Missing Apify API key or OAuth token');
      }

      return new ApifyClient({
        token,
        requestInterceptors: [
          (request) => {
            // Headers may be absent on the outgoing request config; create them before tagging.
            if (!request.headers) {
              request.headers = {};
            }
            request.headers['x-apify-integration-platform'] = 'windmill';
            return request;
          },
        ],
      });
    };
    
    /**
     * Polls an Apify Actor run until it reaches a terminal status
     * (`SUCCEEDED`, `FAILED`, `ABORTED` or `TIMED-OUT`).
     *
     * @param client - Apify client used to fetch the run.
     * @param runId - ID of the run to watch.
     * @param options - `throwIfNotSucceeded`: reject when the run ends in any status
     *   other than `SUCCEEDED`. `pollIntervalMs`: delay between polls in
     *   milliseconds (defaults to 1000).
     * @returns The run object obtained by the last poll.
     * @throws Error when the client returns no run for `runId`, or when
     *   `throwIfNotSucceeded` is set and the run did not succeed.
     */
    async function pollRunStatus(
      client: ApifyClient,
      runId: string,
      options: { throwIfNotSucceeded: boolean; pollIntervalMs?: number } = { throwIfNotSucceeded: false }
    ): Promise<any> {
      const terminalStatuses = ['SUCCEEDED', 'FAILED', 'ABORTED', 'TIMED-OUT'];
      const pollIntervalMs = options.pollIntervalMs ?? 1000;

      let runData: any;
      while (true) {
        runData = await client.run(runId).get();
        // Fail with a clear message instead of a TypeError on `undefined.status`
        // when the client returns nothing for this run.
        if (!runData) {
          throw new Error(`Actor run ${runId} was not found`);
        }
        if (terminalStatuses.includes(runData.status)) break;
        await new Promise((res) => setTimeout(res, pollIntervalMs));
      }

      if (options.throwIfNotSucceeded && runData.status !== 'SUCCEEDED') {
        throw new Error(`Actor run did not succeed: ${runData.status}`);
      }

      return runData;
    }
    
    // ID of the Apify Actor that `main` starts for each scrape.
    // NOTE(review): per the constant's name this is the web content scraper Actor — confirm the ID against Apify.
    const WEB_CONTENT_SCRAPER_ACTOR_ID = 'aYG0l9s7dbB7j3gbS';
    
    /**
     * Values accepted by `main`'s `crawlerType` parameter; the chosen value is
     * forwarded unchanged as `crawlerType` in the Actor input.
     */
    type CrawlerType =
      | 'cheerio'
      | 'jsdom'
      | 'playwright:adaptive'
      | 'playwright:firefox';
    
    /**
     * Scrapes a single URL by running the Actor identified by
     * `WEB_CONTENT_SCRAPER_ACTOR_ID` and returns the first item of the run's
     * default dataset with its `text` field removed.
     *
     * @param url - Page to scrape; passed as the only start URL.
     * @param crawlerType - Crawler engine forwarded to the Actor (defaults to `'cheerio'`).
     * @param api_key - Optional API-key credential.
     * @param oauth_token - Optional OAuth credential.
     * @returns The first dataset item (without `text`), or `{ error: string }` when
     *   the run, the dataset lookup or the item fetch fails.
     * @throws Error when no credential is supplied (the client is created outside
     *   the `try`, so that error is not converted to an `{ error }` result).
     */
    export async function main(
      url: string,
      crawlerType: CrawlerType = 'cheerio',
      api_key?: ApifyApiKey,
      oauth_token?: Apify,
    ) {
      const client = createClient(api_key, oauth_token);

      try {
        // Single-page settings, going by the option names (depth 0, one page, one
        // result) — NOTE(review): confirm against the Actor's input schema.
        const input: Record<string, any> = {
          startUrls: [{ url }],
          crawlerType,
          maxCrawlDepth: 0,
          maxCrawlPages: 1,
          maxResults: 1,
          proxyConfiguration: { useApifyProxy: true },
          removeCookieWarnings: true,
          saveHtml: true,
          saveMarkdown: true,
        };

        // 1. Start actor run. `waitSecs: 0` is passed so that waiting is left to the
        //    poll below — NOTE(review): confirm against apify-client `call` semantics.
        const run = await client
          .actor(WEB_CONTENT_SCRAPER_ACTOR_ID)
          .call(input, { waitSecs: 0 });
        const runId = run.id;
        if (!runId) {
          return { error: 'No run ID returned from actor run' };
        }

        // 2. Poll for terminal status (throws unless the run SUCCEEDED)
        const lastRunData = await pollRunStatus(client, runId, { throwIfNotSucceeded: true });
        const defaultDatasetId = lastRunData.defaultDatasetId;
        if (!defaultDatasetId) {
          return { error: 'No dataset ID returned from actor run' };
        }

        // 3. Fetch first item from dataset; only one item is used, so only one is requested
        const items = await client.dataset(defaultDatasetId).listItems({ limit: 1 });
        if (!Array.isArray(items.items) || items.items.length === 0) {
          return { error: 'No items found in dataset' };
        }

        const wccResultItem = items.items[0];

        // The `text` field is dropped from the returned item.
        delete wccResultItem.text;

        return wccResultItem;
      } catch (error: unknown) {
        // Anything can be thrown, not just Error instances: use `message` when
        // present, otherwise stringify the thrown value itself. This avoids
        // "Reason: undefined" and a crash inside the handler on a thrown null.
        const message = (error as { message?: unknown } | null | undefined)?.message;
        const reason = message != null ? String(message) : String(error);
        return {
          error: `Failed to scrape URL. Reason: ${reason}`,
        };
      }
    }
    

    Submitted by jakub.drobnik222 172 days ago