import * as wmill from "https://deno.land/x/windmill@v1.84.1/mod.ts";
import puppeteer from "npm:puppeteer";
// Delay, in milliseconds, that main() waits after each navigation so the
// page has a chance to finish loading before it is inspected.
const loadingTimeToWait = 2000;
// Translations / transliterations of the word "contact" (72 entries).
// findContactURL looks for each entry, in this order, as a substring of an
// anchor's href attribute, so earlier entries take priority.
const commonContactTranslations = [
"contact",
"contacto",
"lianxi",
"ittisal",
"sampark",
"yogayoga",
"kontakt",
"kontakuto",
"lianlo",
"sampradincandi",
"lienhe",
"totarpu",
"temas",
"yeollak",
"rabtah",
"tiddtawdt",
"bandhappetal",
"seikswaya",
"alaqe",
"taanu",
"qaba",
"samparka",
"aloqa",
"kaantakat",
"yaginnu",
"igbesiaye",
"ngwucha",
"thintana",
"laxiriir",
"vohitra",
"tektoun",
"bwino",
"sampŕadiṁcaṇḍi",
"toṭarpu",
"saṁparka",
"seikswa ya",
"kaanṭakṭ",
"igbesi aye",
"ngwụcha",
"sambandha",
"kysymys",
"kontak",
"kontakte",
"kumunikasyon",
"contactare",
"kontakta",
"kontaktu",
"kontaktní",
"kontaktoplysninger",
"kontakti",
"kontrakt",
"kapcsolat",
"atnaujinti",
"kontaktas",
"kontakto",
"kontaktinis",
"kontaktai",
"kontaktni",
"kontaktirati",
"kontaktné",
"kontaktujte",
"kontaktom",
"kontaktuppgifter",
"kapcsolattartó",
"samband",
"kontaktowa",
"kontaktul",
"kontaktim",
"kontaktini",
"kontaktný",
"kontratta",
"contatto",
];
/**
 * Naive lookup of a contact page link on the currently loaded page.
 *
 * Searches the document for an anchor whose `href` attribute contains one of
 * the known translations of "contact" (case-insensitively). Translations are
 * tried in list order, so earlier entries take priority. This is not robust:
 * not every website exposes a route containing the word "contact".
 *
 * @param page Puppeteer page that has already navigated to the target site.
 * @returns The absolute URL of the first matching http(s) link, or `null`
 *   when no candidate link exists.
 */
async function findContactURL(page: puppeteer.Page): Promise<string | null> {
  // The callback is serialized and executed inside the browser, so it must
  // rely only on DOM APIs and its arguments; the Node-side `page` object
  // does not exist there.
  return await page.evaluate((translations: string[]): string | null => {
    for (const translation of translations) {
      try {
        const candidates = Array.from(
          document.querySelectorAll<HTMLAnchorElement>(
            `a[href*="${translation}" i]`,
          ),
        );
        // Skip mailto:, tel:, javascript: and similar links, which cannot be
        // navigated to as a page.
        const match = candidates.find(
          (anchor) =>
            anchor.protocol === "http:" || anchor.protocol === "https:",
        );
        if (match) {
          // `.href` is the resolved absolute URL even when the attribute
          // holds a relative path.
          return match.href;
        }
      } catch {
        // A translation that yields an invalid selector is skipped rather
        // than aborting the whole search.
        continue;
      }
    }
    return null;
  }, commonContactTranslations);
}
/**
 * Extracts every email-looking substring from a chunk of HTML/text.
 *
 * @param htmlContent Raw page content to scan.
 * @returns All matches in document order (duplicates included); an empty
 *   array when nothing matches.
 */
function extractEmail(htmlContent: string): string[] {
  // The TLD class is [A-Za-z]; a "|" inside a character class is a literal
  // pipe, which would let "a@b.com|c@d.com" match as one bogus address.
  const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
  return htmlContent.match(emailPattern) ?? [];
}
/**
 * Reports whether the given string can be parsed as an absolute URL.
 *
 * @param url Candidate URL string.
 * @returns `true` when the `URL` constructor accepts the string, `false`
 *   when it throws.
 */
function isValidURL(url: string): boolean {
  let parsedOk = true;
  try {
    // Only the constructor's success matters; the parsed value is discarded.
    new URL(url);
  } catch {
    parsedOk = false;
  }
  return parsedOk;
}
/**
 * Value returned by `main`. Both fields start as null and are only filled in
 * together, once a contact page was opened and an email was found on it.
 */
interface Result {
// First email address matched in the contact page's content, or null.
email: string | null;
// URL of the contact page the email was taken from, or null.
contact_page: string | null;
}
/**
 * Opens a website, tries to locate its contact page and extracts the first
 * email address found there.
 *
 * Navigation failures are treated as "nothing found" rather than errors, so
 * the function resolves with an all-null result in those cases.
 *
 * @param websiteUrl Absolute URL of the site to inspect.
 * @param googleSheetId Currently unused; kept so existing callers that pass
 *   it keep working.
 * @returns The contact page URL and first email when both were found,
 *   otherwise `{ contact_page: null, email: null }`.
 */
export async function main(
  websiteUrl: string,
  googleSheetId?: string,
): Promise<Result> {
  const nothingFound: Result = { contact_page: null, email: null };
  if (!isValidURL(websiteUrl)) {
    return nothingFound;
  }

  // Plain timer instead of page.waitForTimeout, which is deprecated and
  // absent from recent Puppeteer releases.
  const letPageSettle = () =>
    new Promise<void>((resolve) => setTimeout(resolve, loadingTimeToWait));

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    try {
      await page.goto(websiteUrl);
    } catch {
      // Unreachable site: best-effort contract says report "nothing found".
      return nothingFound;
    }
    await letPageSettle();

    const contactURL = await findContactURL(page);
    if (!contactURL) {
      return nothingFound;
    }

    // Naive approach worked and a link with a "contact" route was found.
    try {
      await page.goto(contactURL);
    } catch {
      return nothingFound;
    }
    await letPageSettle();

    const [firstEmail] = extractEmail(await page.content());
    if (!firstEmail) {
      return nothingFound;
    }
    return { contact_page: contactURL, email: firstEmail };
  } finally {
    // Runs on every exit path (early return or thrown error) so the browser
    // process is never leaked.
    await browser.close();
  }
}
// Submitted by pesjak.matej93 489 days ago