1 | import { ApifyClient } from 'apify-client@^2.19.0'; |
2 |
|
/** API-key credential for Apify; `api_key` is used as the client token when no OAuth token is supplied. */
type ApifyApiKey = { api_key: string };
6 |
|
/** OAuth credential for Apify; `token` is the token string preferred by `createClient` over an API key. */
type Apify = { token: string };
10 |
|
/**
 * Builds an Apify API client from whichever credential the caller supplied.
 *
 * The OAuth token is preferred; the API key is used when no usable OAuth
 * token is present. Blank credentials count as missing, so an empty OAuth
 * token does not shadow a valid API key.
 *
 * @param api_key - Optional API-key credential (reads its `api_key` field).
 * @param oauth_token - Optional OAuth credential (reads its `token` field).
 * @returns A client configured with the resolved token and a request
 *   interceptor that sets the `x-apify-integration-platform` header.
 * @throws Error when neither credential contains a non-empty token.
 */
const createClient = (api_key?: ApifyApiKey, oauth_token?: Apify): ApifyClient => {
  // `||` rather than `??`: an empty-string OAuth token must fall through to the API key.
  const token = oauth_token?.token || api_key?.api_key;
  if (!token) {
    throw new Error('Missing Apify API key or OAuth token');
  }

  return new ApifyClient({
    token,
    requestInterceptors: [
      (request) => {
        // Identify the calling platform on outgoing requests, creating the
        // header bag first if the request does not have one yet.
        if (!request.headers) {
          request.headers = {};
        }
        request.headers['x-apify-integration-platform'] = 'windmill';
        return request;
      },
    ],
  });
};
30 |
|
/**
 * Polls an Actor run until it reports a terminal status
 * (`SUCCEEDED`, `FAILED`, `ABORTED` or `TIMED-OUT`).
 *
 * There is no overall deadline: the loop keeps polling for as long as the run
 * stays in a non-terminal status.
 *
 * @param client - Apify client used to fetch the run.
 * @param runId - ID of the run to watch.
 * @param options - `throwIfNotSucceeded` (default `false`) turns any terminal
 *   status other than `SUCCEEDED` into an error; `pollIntervalMs`
 *   (default `1000`) is the delay between two status checks.
 * @returns The run object from the last poll.
 * @throws Error when the run cannot be fetched (the client returned nothing),
 *   or when `throwIfNotSucceeded` is set and the run did not succeed.
 */
async function pollRunStatus(
  client: ApifyClient,
  runId: string,
  options: { throwIfNotSucceeded?: boolean; pollIntervalMs?: number } = {},
): Promise<any> {
  const { throwIfNotSucceeded = false, pollIntervalMs = 1000 } = options;
  const terminalStatuses = new Set(['SUCCEEDED', 'FAILED', 'ABORTED', 'TIMED-OUT']);

  let runData: any;
  while (true) {
    runData = await client.run(runId).get();
    // The lookup can come back empty; fail with a clear message instead of a
    // TypeError on `.status`.
    if (!runData) {
      throw new Error(`Actor run not found: ${runId}`);
    }
    if (terminalStatuses.has(runData.status)) break;
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }

  if (throwIfNotSucceeded && runData.status !== 'SUCCEEDED') {
    throw new Error(`Actor run did not succeed: ${runData.status}`);
  }

  return runData;
}
51 |
|
/** ID of the Apify Actor that `main` runs to scrape a page. */
const WEB_CONTENT_SCRAPER_ACTOR_ID = 'aYG0l9s7dbB7j3gbS';

/** Values accepted for the `crawlerType` field of the Actor input. */
type CrawlerType = 'cheerio' | 'jsdom' | 'playwright:adaptive' | 'playwright:firefox';
59 |
|
/**
 * Scrapes a single URL by running the web content scraper Actor and returns
 * the first item of the run's default dataset, without its `text` field.
 *
 * Every failure after the client has been created is reported as
 * `{ error: string }` instead of being thrown.
 *
 * @param url - Page to scrape; must be a non-empty string.
 * @param crawlerType - Crawler type passed to the Actor (default `'cheerio'`).
 * @param api_key - Optional API-key credential.
 * @param oauth_token - Optional OAuth credential; preferred over `api_key`.
 * @returns The scraped dataset item, or `{ error }` describing what went wrong.
 * @throws Error when no credential is supplied (raised by `createClient`).
 */
export async function main(
  url: string,
  crawlerType: CrawlerType = 'cheerio',
  api_key?: ApifyApiKey,
  oauth_token?: Apify,
) {
  // Runs outside the try block, so a missing-credentials error propagates to
  // the caller rather than being returned as `{ error }`.
  const client = createClient(api_key, oauth_token);

  try {
    // Reject a blank URL up front instead of starting a remote run for it.
    if (typeof url !== 'string' || url.trim() === '') {
      return { error: 'Failed to scrape URL. Reason: URL must be a non-empty string' };
    }

    // Single-page crawl: no link following, one page, one result.
    const input = {
      startUrls: [{ url }],
      crawlerType,
      maxCrawlDepth: 0,
      maxCrawlPages: 1,
      maxResults: 1,
      proxyConfiguration: { useApifyProxy: true },
      removeCookieWarnings: true,
      saveHtml: true,
      saveMarkdown: true,
    };

    // `waitSecs: 0`: completion is awaited by pollRunStatus below rather
    // than inside this call.
    const run = await client
      .actor(WEB_CONTENT_SCRAPER_ACTOR_ID)
      .call(input, { waitSecs: 0 });
    const runId = run.id;
    if (!runId) {
      return { error: 'No run ID returned from actor run' };
    }

    // Throws when the run ends in any status other than SUCCEEDED; the catch
    // below turns that into an `{ error }` result.
    const lastRunData = await pollRunStatus(client, runId, { throwIfNotSucceeded: true });
    const defaultDatasetId = lastRunData.defaultDatasetId;
    if (!defaultDatasetId) {
      return { error: 'No dataset ID returned from actor run' };
    }

    // Only the first item is used, so do not fetch more than one.
    const items = await client.dataset(defaultDatasetId).listItems({ limit: 1 });
    if (!Array.isArray(items.items) || items.items.length === 0) {
      return { error: 'No items found in dataset' };
    }

    const wccResultItem = items.items[0];

    // Drop the plain-text rendition from the result
    // (presumably redundant next to the saved HTML/Markdown — confirm).
    delete wccResultItem.text;

    return wccResultItem;
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances are guaranteed a `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      error: `Failed to scrape URL. Reason: ${reason}`,
    };
  }
}
114 |
|