Scrape Single URL
One script reply has been approved by the moderators (Verified).

Scrape a single URL using the Apify Website Content Crawler Actor and get its content as text, markdown, and HTML.

Created by jakub.drobnik222 108 days ago Picked 12 times
Submitted by jakub.drobnik222 Bun
Verified 108 days ago
1
import { ApifyClient } from 'apify-client@^2.19.0';
2

3
type ApifyApiKey = {
4
  api_key: string;
5
};
6

7
type Apify = {
8
  token: string;
9
};
10

11
const createClient = (api_key?: ApifyApiKey, oauth_token?: Apify): ApifyClient => {
12
  const token = oauth_token?.token ?? api_key?.api_key;
13
  if (!token) {
14
    throw new Error('Missing Apify API key or OAuth token');
15
  }
16

17
  return new ApifyClient({
18
    token: token,
19
    requestInterceptors: [
20
      (request) => {
21
        if (!request.headers) {
22
          request.headers = {};
23
        }
24
        request.headers['x-apify-integration-platform'] = 'windmill';
25
        return request;
26
      },
27
    ],
28
  });
29
};
30

31
async function pollRunStatus(
32
  client: ApifyClient,
33
  runId: string,
34
  options: { throwIfNotSucceeded: boolean; } = { throwIfNotSucceeded: false }
35
): Promise<any> {
36
  let status = '';
37
  let runData: any;
38
  while (true) {
39
    runData = await client.run(runId).get();
40
    status = runData.status;
41
    if (['SUCCEEDED', 'FAILED', 'ABORTED', 'TIMED-OUT'].includes(status)) break;
42
    await new Promise((res) => setTimeout(res, 1000));
43
  }
44

45
  if (options.throwIfNotSucceeded && status !== 'SUCCEEDED') {
46
    throw new Error(`Actor run did not succeed: ${status}`);
47
  }
48

49
  return runData;
50
}
51

52
const WEB_CONTENT_SCRAPER_ACTOR_ID = 'aYG0l9s7dbB7j3gbS';
53

54
type CrawlerType =
55
  | 'cheerio'
56
  | 'jsdom'
57
  | 'playwright:adaptive'
58
  | 'playwright:firefox';
59

60
export async function main(
61
  url: string,
62
  crawlerType: CrawlerType = 'cheerio',
63
  api_key?: ApifyApiKey,
64
  oauth_token?: Apify,
65
) {
66
  const client = createClient(api_key, oauth_token);
67

68
  try {
69
    const input: Record<string, any> = {
70
      startUrls: [{ url }],
71
      crawlerType,
72
      maxCrawlDepth: 0,
73
      maxCrawlPages: 1,
74
      maxResults: 1,
75
      proxyConfiguration: { useApifyProxy: true },
76
      removeCookieWarnings: true,
77
      saveHtml: true,
78
      saveMarkdown: true,
79
    };
80

81
    // 1. Start actor run
82
    const run = await client
83
      .actor(WEB_CONTENT_SCRAPER_ACTOR_ID)
84
      .call(input, { waitSecs: 0 });
85
    const runId = run.id;
86
    if (!runId) {
87
      return { error: 'No run ID returned from actor run' };
88
    }
89

90
    // 2. Poll for terminal status
91
    const lastRunData = await pollRunStatus(client, runId, { throwIfNotSucceeded: true });
92
    const defaultDatasetId = lastRunData.defaultDatasetId;
93
    if (!defaultDatasetId) {
94
      return { error: 'No dataset ID returned from actor run' };
95
    }
96

97
    // 3. Fetch first item from dataset
98
    const items = await client.dataset(defaultDatasetId).listItems();
99
    if (!Array.isArray(items.items) || items.items.length === 0) {
100
      return { error: 'No items found in dataset' };
101
    }
102

103
    const wccResultItem = items.items[0];
104

105
    delete wccResultItem.text;
106

107
    return wccResultItem;
108
  } catch (error: any) {
109
    return {
110
      error: `Failed to scrape URL. Reason: ${error.message}`,
111
    };
112
  }
113
}
114