Ctrl K

web scraping example

Published Jul 19, 2024

Script github

Submitted by ksa.mo7md22 Python3

Created 703 days ago

from typing import List, Dict
import requests
from bs4 import BeautifulSoup


def main(url: str, timeout: float = 10.0) -> List[Dict[str, str]]:
    """Scrape the links found inside the page's "quickfinder" section.

    Fetches ``url`` with browser-like request headers, parses the response
    as HTML, locates the first element whose class is ``quickfinder`` and
    collects every ``<a>`` tag nested anywhere inside it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        One ``{"text": ..., "href": ...}`` dict per anchor that has
        non-empty visible text, in document order. ``href`` is ``None`` for
        anchors that carry no ``href`` attribute. An empty list is returned
        when the page has no ``quickfinder`` element.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        requests.exceptions.RequestException: The request failed or timed
            out.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        # "br" is deliberately not advertised: Brotli bodies are only decoded
        # when an optional brotli package is installed; otherwise the parser
        # would be handed still-compressed bytes.
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0",
        "TE": "Trailers",
        "DNT": "1",  # Do Not Track
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the section with class "quickfinder"
    quickfinder_section = soup.find(class_="quickfinder")
    if not quickfinder_section:
        return []

    results = []
    for a in quickfinder_section.find_all("a", recursive=True):
        # Extract the text once; anchors without visible text are skipped.
        text = a.get_text(strip=True)
        if text:
            results.append({"text": text, "href": a.get("href")})

    return results


`1`	`from typing import List, Dict`
`2`	`import requests`
`3`	`from bs4 import BeautifulSoup`
`4`
`5`
`6`	`def main(url: str) -> List[Dict[str, str]]:`
`7`	`# Send a GET request to the URL`
`8`	`headers = {`
`9`	`"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",`
`10`	`"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",`
`11`	`"Accept-Language": "en-US,en;q=0.5",`
`12`	`"Accept-Encoding": "gzip, deflate, br",`
`13`	`"Connection": "keep-alive",`
`14`	`"Upgrade-Insecure-Requests": "1",`
`15`	`"Cache-Control": "max-age=0",`
`16`	`"TE": "Trailers",`
`17`	`"DNT": "1", # Do Not Track`
`18`	`"Sec-Fetch-Dest": "document",`
`19`	`"Sec-Fetch-Mode": "navigate",`
`20`	`"Sec-Fetch-Site": "none",`
`21`	`"Sec-Fetch-User": "?1",`
`22`	`}`
`23`	`response = requests.get(url, headers=headers)`
`24`
`25`	`# Parse the HTML content using BeautifulSoup`
`26`	`soup = BeautifulSoup(response.content, "html.parser")`
`27`
`28`	`# Find the section with class "quickfinder"`
`29`	`quickfinder_section = soup.find(class_="quickfinder")`
`30`
`31`	`# If the section is found, get all 'a' tags within it`
`32`	`if quickfinder_section:`
`33`	`a_tags = quickfinder_section.find_all("a", recursive=True)`
`34`	`results = [`
`35`	`{"text": a.get_text(strip=True), "href": a.get("href")}`
`36`	`for a in a_tags`
`37`	`if a.get_text(strip=True)`
`38`	`]`
`39`	`else:`
`40`	`results = []`
`41`
`42`	`return results`
`43`