1 | import * as wmill from "https://deno.land/x/windmill@v1.84.1/mod.ts"; |
2 |
|
3 | import puppeteer from "npm:puppeteer"; |
4 |
|
// Pause, in milliseconds, applied after each navigation so the page can settle
// before it is inspected.
const loadingTimeToWait = 2_000;

// Substrings searched for inside anchor `href` attributes when looking for a
// contact page. Presumably romanised spellings of "contact" in many languages
// (not verified here). Order matters: candidates are tried first to last.
// prettier-ignore
const commonContactTranslations = [
  "contact", "contacto", "lianxi", "ittisal",
  "sampark", "yogayoga", "kontakt", "kontakuto",
  "lianlo", "sampradincandi", "lienhe", "totarpu",
  "temas", "yeollak", "rabtah", "tiddtawdt",
  "bandhappetal", "seikswaya", "alaqe", "taanu",
  "qaba", "samparka", "aloqa", "kaantakat",
  "yaginnu", "igbesiaye", "ngwucha", "thintana",
  "laxiriir", "vohitra", "tektoun", "bwino",
  "sampŕadiṁcaṇḍi", "toṭarpu", "saṁparka", "seikswa ya",
  "kaanṭakṭ", "igbesi aye", "ngwụcha", "sambandha",
  "kysymys", "kontak", "kontakte", "kumunikasyon",
  "contactare", "kontakta", "kontaktu", "kontaktní",
  "kontaktoplysninger", "kontakti", "kontrakt", "kapcsolat",
  "atnaujinti", "kontaktas", "kontakto", "kontaktinis",
  "kontaktai", "kontaktni", "kontaktirati", "kontaktné",
  "kontaktujte", "kontaktom", "kontaktuppgifter", "kapcsolattartó",
  "samband", "kontaktowa", "kontaktul", "kontaktim",
  "kontaktini", "kontaktný", "kontratta", "contatto",
];
82 |
|
83 |
|
84 |
|
85 |
|
/**
 * Looks for a link to a contact page in the document currently loaded in `page`.
 *
 * Each entry of `commonContactTranslations` is tried, in order, as a
 * case-insensitive substring of an anchor's `href` attribute; the first
 * candidate that matches an HTML anchor wins.
 *
 * @param page - Puppeteer page that has already navigated to the site.
 * @returns The matching anchor's resolved `href`, or `null` when nothing matches.
 */
async function findContactURL(page: puppeteer.Page): Promise<string | null> {
  // The callback below is serialized and executed inside the browser, so it
  // cannot see Node/Deno-side variables such as `page`. It must rely on DOM
  // APIs and on the arguments that are passed explicitly after it.
  return await page.evaluate((translations: string[]): string | null => {
    for (const translation of translations) {
      // The trailing `i` makes the attribute comparison case-insensitive, so
      // "/Contact" and "/contact" are both found.
      const candidate = document.querySelector(`a[href*="${translation}" i]`);

      // Only HTML anchors expose `href` as a string (an SVG <a> does not).
      if (candidate instanceof HTMLAnchorElement && candidate.href) {
        return candidate.href;
      }
    }

    return null;
  }, commonContactTranslations);
}
110 |
|
/**
 * Collects every substring of `htmlContent` that looks like an email address.
 *
 * @param htmlContent - Raw text or HTML markup to scan.
 * @returns All matches in document order (duplicates included); an empty array
 *   when there are none.
 */
function extractEmail(htmlContent: string): string[] {
  // The TLD class is letters only. Writing it as `[A-Z|a-z]` would also accept
  // a literal "|" and glue unrelated text onto the address.
  const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
  return htmlContent.match(emailPattern) ?? [];
}
116 |
|
/**
 * Reports whether `url` can be parsed by the `URL` constructor.
 *
 * @param url - Candidate URL string.
 * @returns `true` when parsing succeeds, `false` when the constructor throws.
 */
function isValidURL(url: string) {
  let parsed: URL | undefined;

  try {
    parsed = new URL(url);
  } catch {
    // The constructor throws on input it cannot parse.
    parsed = undefined;
  }

  return parsed !== undefined;
}
125 |
|
/** Value returned by `main`; both fields are `null` until an email is found. */
interface Result {
  /** URL of the contact page on which the email was found, or `null`. */
  contact_page: string | null;
  /** First email address extracted from the contact page, or `null`. */
  email: string | null;
}
130 |
|
/**
 * Visits `websiteUrl`, follows the first link that looks like a contact page
 * and returns the first email address found in that page's HTML.
 *
 * The lookup is best-effort: an unparsable URL, a failed navigation, a missing
 * contact link or a contact page without any email all yield a result whose
 * fields are both `null`.
 *
 * @param websiteUrl - Address of the site to inspect.
 * @param googleSheetId - Accepted for compatibility with existing callers; not
 *   used by this function.
 * @returns The contact page URL and email when an email was found, otherwise
 *   `{ contact_page: null, email: null }`.
 */
export async function main(
  websiteUrl: string,
  googleSheetId?: string,
): Promise<Result> {
  const emptyResult: Result = { contact_page: null, email: null };

  if (!isValidURL(websiteUrl)) {
    return emptyResult;
  }

  // A plain timer is used instead of `page.waitForTimeout`, which no longer
  // exists in current Puppeteer releases (the import is not version-pinned).
  const pause = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch();

  // Everything after launch sits in try/finally so the browser process is
  // closed on every exit path, including early returns and thrown errors.
  try {
    const page = await browser.newPage();

    try {
      await page.goto(websiteUrl);
    } catch {
      // Deliberate best-effort: an unreachable site is reported as "nothing found".
      return emptyResult;
    }
    await pause(loadingTimeToWait);

    const contactURL = await findContactURL(page);
    if (!contactURL) {
      return emptyResult;
    }

    try {
      await page.goto(contactURL);
    } catch {
      // Same best-effort policy for a broken contact link.
      return emptyResult;
    }
    await pause(loadingTimeToWait);

    const [firstEmail] = extractEmail(await page.content());
    if (!firstEmail) {
      return emptyResult;
    }

    return { contact_page: contactURL, email: firstEmail };
  } finally {
    await browser.close();
  }
}
177 |
|