import { ApifyClient } from 'apify-client@^2.19.0';
/** Apify API-key credential; `createClient` reads `api_key` as the fallback token. */
type ApifyApiKey = {
api_key: string;
};
/** Apify OAuth credential; `createClient` prefers its `token` over the API key. */
type Apify = {
token: string;
};
/**
 * Builds an Apify API client authenticated with whichever credential is usable.
 *
 * The OAuth token is preferred; the API key is used when no usable OAuth token
 * is supplied. Empty strings count as "not provided", so a blank OAuth token
 * does not mask a valid API key.
 *
 * Every outgoing request is tagged with the
 * `x-apify-integration-platform: windmill` header.
 *
 * @param api_key - Optional API-key credential.
 * @param oauth_token - Optional OAuth credential; wins over `api_key` when its token is non-empty.
 * @returns A configured `ApifyClient`.
 * @throws Error when neither credential carries a non-empty token.
 */
const createClient = (api_key?: ApifyApiKey, oauth_token?: Apify): ApifyClient => {
  // `||` rather than `??`: an empty-string OAuth token must fall through to the API key.
  const token = oauth_token?.token || api_key?.api_key;
  if (!token) {
    throw new Error('Missing Apify API key or OAuth token');
  }
  return new ApifyClient({
    token,
    requestInterceptors: [
      (request) => {
        // Requests may arrive without a headers bag; create one before tagging.
        if (!request.headers) {
          request.headers = {};
        }
        request.headers['x-apify-integration-platform'] = 'windmill';
        return request;
      },
    ],
  });
};
/**
 * Polls an Apify run until it reaches a terminal status and returns the last
 * run data that was fetched.
 *
 * Terminal statuses are `SUCCEEDED`, `FAILED`, `ABORTED` and `TIMED-OUT`.
 * There is no upper bound on how long polling continues.
 *
 * @param client - Apify client used to fetch the run.
 * @param runId - ID of the run to watch.
 * @param options - `throwIfNotSucceeded` (default `false`) turns any terminal
 *   status other than `SUCCEEDED` into an exception; `pollIntervalMs`
 *   (default `1000`) is the pause between two status fetches.
 * @returns The run data from the final fetch.
 * @throws Error when the client returns no data for the run, or when
 *   `throwIfNotSucceeded` is set and the run ended in a non-success status.
 */
async function pollRunStatus(
  client: ApifyClient,
  runId: string,
  options: { throwIfNotSucceeded?: boolean; pollIntervalMs?: number } = { throwIfNotSucceeded: false }
): Promise<any> {
  const { throwIfNotSucceeded = false, pollIntervalMs = 1000 } = options;
  const terminalStatuses = ['SUCCEEDED', 'FAILED', 'ABORTED', 'TIMED-OUT'];
  let runData: any;
  while (true) {
    runData = await client.run(runId).get();
    // Fail with a clear message instead of a TypeError on `runData.status`
    // when the fetch yields nothing for this run ID.
    if (!runData) {
      throw new Error(`Actor run not found: ${runId}`);
    }
    if (terminalStatuses.includes(runData.status)) break;
    await new Promise<void>((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  if (throwIfNotSucceeded && runData.status !== 'SUCCEEDED') {
    throw new Error(`Actor run did not succeed: ${runData.status}`);
  }
  return runData;
}
// ID of the Apify actor that `main` runs to scrape a page.
// NOTE(review): presumably Apify's Website Content Crawler — confirm against the Apify Store.
const WEB_CONTENT_SCRAPER_ACTOR_ID = 'aYG0l9s7dbB7j3gbS';
/** Accepted values for the `crawlerType` field of the actor input built in `main`. */
type CrawlerType =
| 'cheerio'
| 'jsdom'
| 'playwright:adaptive'
| 'playwright:firefox';
/**
 * Scrapes a single URL by running the configured Apify actor and returns the
 * first item of the run's default dataset with its `text` field removed.
 *
 * The actor input restricts the run to the given URL only (`maxCrawlDepth: 0`,
 * `maxCrawlPages: 1`, `maxResults: 1`), enables the Apify proxy and requests
 * both HTML and Markdown output.
 *
 * Failures while running the actor or reading its dataset are reported as an
 * `{ error: string }` object instead of being thrown. The client is created
 * outside the `try`, so errors raised by `createClient` propagate to the caller.
 *
 * @param url - Page to scrape.
 * @param crawlerType - Crawler engine passed to the actor; defaults to `'cheerio'`.
 * @param api_key - Optional API-key credential.
 * @param oauth_token - Optional OAuth credential.
 * @returns The scraped item without `text`, or `{ error }` describing what went wrong.
 */
export async function main(
  url: string,
  crawlerType: CrawlerType = 'cheerio',
  api_key?: ApifyApiKey,
  oauth_token?: Apify,
) {
  const client = createClient(api_key, oauth_token);
  try {
    const input: Record<string, unknown> = {
      startUrls: [{ url }],
      crawlerType,
      maxCrawlDepth: 0,
      maxCrawlPages: 1,
      maxResults: 1,
      proxyConfiguration: { useApifyProxy: true },
      removeCookieWarnings: true,
      saveHtml: true,
      saveMarkdown: true,
    };

    // 1. Start actor run (waitSecs: 0 — completion is awaited by polling below).
    const run = await client
      .actor(WEB_CONTENT_SCRAPER_ACTOR_ID)
      .call(input, { waitSecs: 0 });
    const runId = run?.id;
    if (!runId) {
      return { error: 'No run ID returned from actor run' };
    }

    // 2. Poll for terminal status; a non-success status throws and lands in the catch.
    const lastRunData = await pollRunStatus(client, runId, { throwIfNotSucceeded: true });
    const defaultDatasetId = lastRunData.defaultDatasetId;
    if (!defaultDatasetId) {
      return { error: 'No dataset ID returned from actor run' };
    }

    // 3. Fetch first item from dataset; only one item is used, so only one is requested.
    const { items } = await client.dataset(defaultDatasetId).listItems({ limit: 1 });
    if (!Array.isArray(items) || items.length === 0) {
      return { error: 'No items found in dataset' };
    }

    // Drop `text` without mutating the fetched item.
    const { text: _text, ...wccResultItem } = items[0];
    return wccResultItem;
  } catch (error: unknown) {
    // Anything can be thrown; never assume an Error instance (or even an object).
    const message = (error as { message?: unknown } | null | undefined)?.message;
    const reason = message != null ? String(message) : String(error);
    return {
      error: `Failed to scrape URL. Reason: ${reason}`,
    };
  }
}
// Submitted by jakub.drobnik222 88 days ago