0

Page text by URL

by
Published Nov 5, 2024

Returns the visible text content of a webpage specified by the URL. Can be used to feed data to GPT or other LLMs. The response can be in plain text, JSON, or XML format based on the text_format parameter. Proxies and Chromium JavaScript rendering are used for page retrieval and processing. Returns JSON on error.

Script webscrapingai Verified

The script

Submitted by hugo697 Bun
Verified 581 days ago
1
//native
2
/**
 * Credentials resource for the WebScraping.AI API.
 */
type Webscrapingai = {
	/** API key; the script sends it as the `api_key` query parameter. */
	apiKey: string
}
5
/**
 * Page text by URL
 *
 * Returns the visible text content of a webpage specified by the URL. Can be
 * used to feed data to GPT or other LLMs. The response can be in plain text,
 * JSON, or XML format based on the text_format parameter. Proxies and Chromium
 * JavaScript rendering are used for page retrieval and processing. Returns
 * JSON on error.
 *
 * Every argument other than `auth` and `headers` is forwarded as a query
 * parameter of the same name, and is omitted when it is `undefined` or `''`.
 *
 * @param auth - Resource holding the API key, sent as `api_key`.
 * @param text_format - Requested response format (`plain`, `xml` or `json`).
 * @param headers - Arbitrarily nested value flattened into bracket-notation
 *   query parameters rooted at `headers` (e.g. `headers[Cookie]=...`).
 * @returns The parsed JSON body when the response is JSON (either
 *   `text_format` is `'json'` or the Content-Type header mentions JSON);
 *   otherwise the raw response body as a string.
 * @throws Error with message `"<status> <body>"` when the HTTP status is not OK.
 */
export async function main(
	auth: Webscrapingai,
	text_format: 'plain' | 'xml' | 'json' | undefined,
	return_links: string | undefined,
	url: string | undefined,
	headers: any,
	timeout: string | undefined,
	js: string | undefined,
	js_timeout: string | undefined,
	wait_for: string | undefined,
	proxy: 'datacenter' | 'residential' | undefined,
	country: 'us' | 'gb' | 'de' | 'it' | 'fr' | 'ca' | 'es' | 'ru' | 'jp' | 'kr' | 'in' | undefined,
	custom_proxy: string | undefined,
	device: 'desktop' | 'mobile' | 'tablet' | undefined,
	error_on_404: string | undefined,
	error_on_redirect: string | undefined,
	js_script: string | undefined
) {
	const url_ = new URL(`https://api.webscraping.ai/text`)

	url_.searchParams.append('api_key', auth.apiKey)

	// Flat parameters, in the order they are appended to the query string.
	const scalarParams: [string, string | undefined][] = [
		['text_format', text_format],
		['return_links', return_links],
		['url', url],
		['timeout', timeout],
		['js', js],
		['js_timeout', js_timeout],
		['wait_for', wait_for],
		['proxy', proxy],
		['country', country],
		['custom_proxy', custom_proxy],
		['device', device],
		['error_on_404', error_on_404],
		['error_on_redirect', error_on_redirect],
		['js_script', js_script]
	]
	for (const [name, value] of scalarParams) {
		if (value !== undefined && value !== '') {
			url_.searchParams.append(name, value)
		}
	}

	// `headers` may be nested, so it is flattened to `headers[...]` pairs.
	encodeParams({ headers }).forEach((value, name) => {
		if (value !== undefined && value !== '') {
			url_.searchParams.append(name, value)
		}
	})

	const response = await fetch(url_, {
		method: 'GET'
	})
	if (!response.ok) {
		const text = await response.text()
		throw new Error(`${response.status} ${text}`)
	}

	// The body is only JSON when JSON was requested (or the server says so);
	// plain-text and XML responses would make response.json() throw.
	const contentType = (response.headers.get('content-type') || '').toLowerCase()
	if (text_format === 'json' || contentType.includes('json')) {
		return await response.json()
	}
	return await response.text()
}
66

67
/**
 * Flattens a (possibly nested) object into bracket-notation query parameters.
 *
 * Array elements are emitted under `path[]`, object properties under
 * `path[key]`, and every leaf value is stringified. Top-level properties whose
 * value is `undefined` are skipped.
 *
 * Pairs are appended directly to the `URLSearchParams`, so keys and values
 * containing `&`, `=`, `+` or `%` are preserved verbatim rather than being
 * re-parsed as query-string syntax.
 *
 * @param o - Object whose own enumerable properties are encoded.
 * @returns A `URLSearchParams` holding one entry per leaf value.
 */
function encodeParams(o: any) {
	const params = new URLSearchParams()

	// Recursively walks `value`, building up the bracketed key in `path`.
	function iter(value: any, path: string) {
		if (Array.isArray(value)) {
			for (const item of value) {
				iter(item, path + '[]')
			}
			return
		}
		if (value !== null && typeof value === 'object') {
			for (const key of Object.keys(value)) {
				iter(value[key], path + '[' + key + ']')
			}
			return
		}
		// Leaf: append() handles escaping, unlike manual `key=value` joining.
		params.append(path, String(value))
	}

	for (const key of Object.keys(o)) {
		if (o[key] !== undefined) {
			iter(o[key], key)
		}
	}
	return params
}
91