Scrape Single URL
One script reply has been approved by the moderators (Verified).

Scrape a single URL using the Apify Website Content Crawler Actor and get its content as text, markdown, and HTML.

Created by jakub.drobnik222 88 days ago Picked 9 times
Submitted by jakub.drobnik222 Bun
Verified 88 days ago
1
import { ApifyClient } from 'apify-client@^2.19.0';
2

3
// Windmill resource type: a static Apify API key supplied by the user.
type ApifyApiKey = {
  api_key: string;
};

// Windmill resource type: an OAuth token obtained via the Apify integration.
type Apify = {
  token: string;
};
10

11
const createClient = (api_key?: ApifyApiKey, oauth_token?: Apify): ApifyClient => {
12
  const token = oauth_token?.token ?? api_key?.api_key;
13
  if (!token) {
14
    throw new Error('Missing Apify API key or OAuth token');
15
  }
16

17
  return new ApifyClient({
18
    token: token,
19
    requestInterceptors: [
20
      (request) => {
21
        if (!request.headers) {
22
          request.headers = {};
23
        }
24
        request.headers['x-apify-integration-platform'] = 'windmill';
25
        return request;
26
      },
27
    ],
28
  });
29
};
30

31
async function pollRunStatus(
32
  client: ApifyClient,
33
  runId: string,
34
  options: { throwIfNotSucceeded: boolean; } = { throwIfNotSucceeded: false }
35
): Promise<any> {
36
  let status = '';
37
  let runData: any;
38
  while (true) {
39
    runData = await client.run(runId).get();
40
    status = runData.status;
41
    if (['SUCCEEDED', 'FAILED', 'ABORTED', 'TIMED-OUT'].includes(status)) break;
42
    await new Promise((res) => setTimeout(res, 1000));
43
  }
44

45
  if (options.throwIfNotSucceeded && status !== 'SUCCEEDED') {
46
    throw new Error(`Actor run did not succeed: ${status}`);
47
  }
48

49
  return runData;
50
}
51

52
// Apify Actor ID of the Website Content Crawler.
const WEB_CONTENT_SCRAPER_ACTOR_ID = 'aYG0l9s7dbB7j3gbS';

// Crawling engines supported by the Website Content Crawler input schema.
type CrawlerType =
  | 'cheerio'
  | 'jsdom'
  | 'playwright:adaptive'
  | 'playwright:firefox';
59

60
export async function main(
61
  url: string,
62
  crawlerType: CrawlerType = 'cheerio',
63
  api_key?: ApifyApiKey,
64
  oauth_token?: Apify,
65
) {
66
  const client = createClient(api_key, oauth_token);
67

68
  try {
69
    const input: Record<string, any> = {
70
      startUrls: [{ url }],
71
      crawlerType,
72
      maxCrawlDepth: 0,
73
      maxCrawlPages: 1,
74
      maxResults: 1,
75
      proxyConfiguration: { useApifyProxy: true },
76
      removeCookieWarnings: true,
77
      saveHtml: true,
78
      saveMarkdown: true,
79
    };
80

81
    // 1. Start actor run
82
    const run = await client
83
      .actor(WEB_CONTENT_SCRAPER_ACTOR_ID)
84
      .call(input, { waitSecs: 0 });
85
    const runId = run.id;
86
    if (!runId) {
87
      return { error: 'No run ID returned from actor run' };
88
    }
89

90
    // 2. Poll for terminal status
91
    const lastRunData = await pollRunStatus(client, runId, { throwIfNotSucceeded: true });
92
    const defaultDatasetId = lastRunData.defaultDatasetId;
93
    if (!defaultDatasetId) {
94
      return { error: 'No dataset ID returned from actor run' };
95
    }
96

97
    // 3. Fetch first item from dataset
98
    const items = await client.dataset(defaultDatasetId).listItems();
99
    if (!Array.isArray(items.items) || items.items.length === 0) {
100
      return { error: 'No items found in dataset' };
101
    }
102

103
    const wccResultItem = items.items[0];
104

105
    delete wccResultItem.text;
106

107
    return wccResultItem;
108
  } catch (error: any) {
109
    return {
110
      error: `Failed to scrape URL. Reason: ${error.message}`,
111
    };
112
  }
113
}
114