Ctrl K
Edit history of script submission #22233 for 'Scrape Single URL (apify)'

bun
Approved version
import { ApifyClient } from 'apify-client@^2.19.0';

/**
 * Credential object carrying an Apify API key.
 *
 * Accepted by `main` as the optional `api_key` argument and read by
 * `createClient` as the fallback credential.
 * NOTE(review): presumably mirrors a Windmill resource type of the same
 * name — confirm before renaming.
 */
type ApifyApiKey = {
  /** The API key string used as the client token. */
  api_key: string;
};

/**
 * Credential object carrying an Apify OAuth token.
 *
 * Accepted by `main` as the optional `oauth_token` argument; `createClient`
 * checks it before the API key.
 * NOTE(review): presumably mirrors a Windmill resource type of the same
 * name — confirm before renaming.
 */
type Apify = {
  /** The OAuth token string used as the client token. */
  token: string;
};

/**
 * Builds an Apify client authenticated with whichever credential is usable.
 *
 * The OAuth token takes precedence; the API key is used when the OAuth token
 * is missing or empty.
 *
 * @param api_key - Optional credential object holding an Apify API key.
 * @param oauth_token - Optional credential object holding an Apify OAuth token.
 * @returns A client whose request interceptor sets the
 *   `x-apify-integration-platform: windmill` header on every request.
 * @throws Error when neither credential provides a non-empty token.
 */
const createClient = (api_key?: ApifyApiKey, oauth_token?: Apify): ApifyClient => {
  // `||` rather than `??` on purpose: an empty-string OAuth token is unusable,
  // so it has to fall through to the API key instead of masking it.
  const token = oauth_token?.token || api_key?.api_key;
  if (!token) {
    throw new Error('Missing Apify API key or OAuth token');
  }

  return new ApifyClient({
    token,
    requestInterceptors: [
      (request) => {
        // The request may arrive without a headers object; create one so the
        // assignment below cannot fail.
        if (!request.headers) {
          request.headers = {};
        }
        request.headers['x-apify-integration-platform'] = 'windmill';
        return request;
      },
    ],
  });
};

/**
 * Polls an actor run until it reaches a terminal status.
 *
 * The run is re-fetched via `client.run(runId).get()` and polling stops once
 * its status is one of `SUCCEEDED`, `FAILED`, `ABORTED` or `TIMED-OUT`.
 * There is no upper bound on the number of polls.
 *
 * @param client - Apify client used to fetch the run.
 * @param runId - ID of the run to poll.
 * @param options - `throwIfNotSucceeded`: throw when the terminal status is
 *   anything other than `SUCCEEDED`. `pollIntervalMs`: delay between polls in
 *   milliseconds (defaults to 1000).
 * @returns The run data from the last fetch.
 * @throws Error when the run cannot be fetched (no data returned), or when
 *   `throwIfNotSucceeded` is set and the run ended in a non-success status.
 */
async function pollRunStatus(
  client: ApifyClient,
  runId: string,
  options: { throwIfNotSucceeded: boolean; pollIntervalMs?: number } = { throwIfNotSucceeded: false }
): Promise<any> {
  const terminalStatuses = ['SUCCEEDED', 'FAILED', 'ABORTED', 'TIMED-OUT'];
  const pollIntervalMs = options.pollIntervalMs ?? 1000;

  let runData: any;
  while (true) {
    runData = await client.run(runId).get();
    // Without this guard a missing run surfaces as an opaque
    // "Cannot read properties of undefined" TypeError on `.status`.
    if (!runData) {
      throw new Error(`Actor run not found: ${runId}`);
    }
    if (terminalStatuses.includes(runData.status)) break;
    await new Promise((res) => setTimeout(res, pollIntervalMs));
  }

  if (options.throwIfNotSucceeded && runData.status !== 'SUCCEEDED') {
    throw new Error(`Actor run did not succeed: ${runData.status}`);
  }

  return runData;
}

// ID of the Apify actor that `main` starts for every scrape.
// NOTE(review): presumably the "Website Content Crawler" actor — confirm the ID against the Apify Store.
const WEB_CONTENT_SCRAPER_ACTOR_ID = 'aYG0l9s7dbB7j3gbS';

/**
 * Values accepted for `main`'s `crawlerType` parameter.
 *
 * The chosen value is forwarded unchanged as the `crawlerType` field of the
 * actor input.
 */
type CrawlerType =
  | 'cheerio'
  | 'jsdom'
  | 'playwright:adaptive'
  | 'playwright:firefox';

/**
 * Scrapes a single URL with the Apify actor identified by
 * `WEB_CONTENT_SCRAPER_ACTOR_ID` and returns the first dataset item.
 *
 * The actor is started with crawl depth 0 and a one-page / one-result limit,
 * the run is polled until it finishes, and the first item of the run's
 * default dataset is returned with its `text` field removed.
 *
 * @param url - The page to scrape; passed as the only start URL.
 * @param crawlerType - Crawler engine forwarded to the actor input.
 * @param api_key - Optional Apify API key credential.
 * @param oauth_token - Optional Apify OAuth credential (takes precedence).
 * @returns The first dataset item without `text`, or `{ error }` describing
 *   why the scrape failed.
 * @throws Error when no credential is supplied — client creation happens
 *   before the try block, so that failure is not converted to `{ error }`.
 */
export async function main(
  url: string,
  crawlerType: CrawlerType = 'cheerio',
  api_key?: ApifyApiKey,
  oauth_token?: Apify,
) {
  const client = createClient(api_key, oauth_token);

  try {
    const input: Record<string, unknown> = {
      startUrls: [{ url }],
      crawlerType,
      maxCrawlDepth: 0,
      maxCrawlPages: 1,
      maxResults: 1,
      proxyConfiguration: { useApifyProxy: true },
      removeCookieWarnings: true,
      saveHtml: true,
      saveMarkdown: true,
    };

    // 1. Start the actor run; `waitSecs: 0` asks the client not to wait for
    //    the run to finish, since polling below handles that.
    const run = await client
      .actor(WEB_CONTENT_SCRAPER_ACTOR_ID)
      .call(input, { waitSecs: 0 });
    const runId = run.id;
    if (!runId) {
      return { error: 'No run ID returned from actor run' };
    }

    // 2. Poll for terminal status; a non-success status throws and is
    //    reported through the catch block below.
    const lastRunData = await pollRunStatus(client, runId, { throwIfNotSucceeded: true });
    const defaultDatasetId = lastRunData.defaultDatasetId;
    if (!defaultDatasetId) {
      return { error: 'No dataset ID returned from actor run' };
    }

    // 3. Fetch the first item from the dataset. Only one item is used, so
    //    cap the request instead of downloading the whole dataset.
    const items = await client.dataset(defaultDatasetId).listItems({ limit: 1 });
    if (!Array.isArray(items.items) || items.items.length === 0) {
      return { error: 'No items found in dataset' };
    }

    const wccResultItem = items.items[0];

    // Drop the `text` field from the returned item.
    delete wccResultItem.text;

    return wccResultItem;
  } catch (error: unknown) {
    // Non-Error throwables have no `.message`; stringify them so the reason
    // is never reported as "undefined".
    const reason = error instanceof Error ? error.message : String(error);
    return {
      error: `Failed to scrape URL. Reason: ${reason}`,
    };
  }
}
Submitted by jakub.drobnik222 232 days ago
← Back to the list of replies