Scrape Single URL
One script reply has been approved by the moderators (Verified).

Scrape a single URL using the Apify Website Content Crawler Actor and get its content as text, markdown, and HTML.

Created by jakub.drobnik222 108 days ago Picked 12 times
Submitted by jakub.drobnik222 Bun
Verified 108 days ago
1
import { ApifyClient } from 'apify-client@^2.19.0';
2

3
type ApifyApiKey = {
4
  api_key: string;
5
};
6

7
type Apify = {
8
  token: string;
9
};
10

11
const createClient = (api_key?: ApifyApiKey, oauth_token?: Apify): ApifyClient => {
12
  const token = oauth_token?.token ?? api_key?.api_key;
13
  if (!token) {
14
    throw new Error('Missing Apify API key or OAuth token');
15
  }
16

17
  return new ApifyClient({
18
    token: token,
19
    requestInterceptors: [
20
      (request) => {
21
        if (!request.headers) {
22
          request.headers = {};
23
        }
24
        request.headers['x-apify-integration-platform'] = 'windmill';
25
        return request;
26
      },
27
    ],
28
  });
29
};
30

31
async function pollRunStatus(
32
  client: ApifyClient,
33
  runId: string,
34
  options: { throwIfNotSucceeded: boolean; } = { throwIfNotSucceeded: false }
35
): Promise<any> {
36
  let status = '';
37
  let runData: any;
38
  while (true) {
39
    runData = await client.run(runId).get();
40
    status = runData.status;
41
    if (['SUCCEEDED', 'FAILED', 'ABORTED', 'TIMED-OUT'].includes(status)) break;
42
    await new Promise((res) => setTimeout(res, 1000));
43
  }
44

45
  if (options.throwIfNotSucceeded && status !== 'SUCCEEDED') {
46
    throw new Error(`Actor run did not succeed: ${status}`);
47
  }
48

49
  return runData;
50
}
51

52
const WEB_CONTENT_SCRAPER_ACTOR_ID = 'aYG0l9s7dbB7j3gbS';
53

54
type CrawlerType =
55
  | 'cheerio'
56
  | 'jsdom'
57
  | 'playwright:adaptive'
58
  | 'playwright:firefox';
59

60
export async function main(
61
  url: string,
62
  crawlerType: CrawlerType = 'cheerio',
63
  api_key?: ApifyApiKey,
64
  oauth_token?: Apify,
65
) {
66
  const client = createClient(api_key, oauth_token);
67

68
  try {
69
    const input: Record<string, any> = {
70
      startUrls: [{ url }],
71
      crawlerType,
72
      maxCrawlDepth: 0,
73
      maxCrawlPages: 1,
74
      maxResults: 1,
75
      proxyConfiguration: { useApifyProxy: true },
76
      removeCookieWarnings: true,
77
      saveHtml: true,
78
      saveMarkdown: true,
79
    };
80

81
    // 1. Start actor run
82
    const run = await client
83
      .actor(WEB_CONTENT_SCRAPER_ACTOR_ID)
84
      .call(input, { waitSecs: 0 });
85
    const runId = run.id;
86
    if (!runId) {
87
      return { error: 'No run ID returned from actor run' };
88
    }
89

90
    // 2. Poll for terminal status
91
    const lastRunData = await pollRunStatus(client, runId, { throwIfNotSucceeded: true });
92
    const defaultDatasetId = lastRunData.defaultDatasetId;
93
    if (!defaultDatasetId) {
94
      return { error: 'No dataset ID returned from actor run' };
95
    }
96

97
    // 3. Fetch first item from dataset
98
    const items = await client.dataset(defaultDatasetId).listItems();
99
    if (!Array.isArray(items.items) || items.items.length === 0) {
100
      return { error: 'No items found in dataset' };
101
    }
102

103
    const wccResultItem = items.items[0];
104

105
    delete wccResultItem.text;
106

107
    return wccResultItem;
108
  } catch (error: any) {
109
    return {
110
      error: `Failed to scrape URL. Reason: ${error.message}`,
111
    };
112
  }
113
}
114