web scraping example

Script github

by ksa.mo7md22 · 7/19/2024

  • Submitted by ksa.mo7md22 Python3
    Created 658 days ago
    1
    from typing import List, Dict
    2
    import requests
    3
    from bs4 import BeautifulSoup
    4
    
    
    5
    
    
    6
    def main(url: str, timeout: float = 30.0) -> List[Dict[str, str]]:
        """Fetch *url* and list the links inside its "quickfinder" element.

        The page is requested with browser-like headers, parsed with
        BeautifulSoup's ``html.parser``, and the first element carrying the
        CSS class ``quickfinder`` is searched for ``<a>`` tags.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before ``requests``
                gives up. Without a timeout a stalled server would block
                the caller forever.

        Returns:
            One ``{"text": ..., "href": ...}`` dict per anchor that has
            non-empty visible text, in document order. ``"href"`` is ``None``
            for an anchor without an ``href`` attribute. The list is empty
            when the page has no ``quickfinder`` element.

        Raises:
            requests.exceptions.RequestException: If the request fails
                (connection error, timeout, ...). The HTTP status code is
                not checked, so an error page is parsed like any other page.
        """
        # Browser-like headers. "Accept-Encoding" is intentionally NOT
        # overridden: requests advertises only the encodings it can actually
        # decode. Hard-coding "br" makes servers reply with Brotli, which is
        # left undecoded (and unparseable) when no Brotli library is installed.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Cache-Control": "max-age=0",
            "TE": "Trailers",
            "DNT": "1",  # Do Not Track
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        }
        response = requests.get(url, headers=headers, timeout=timeout)

        # Pass raw bytes so BeautifulSoup can detect the document encoding.
        soup = BeautifulSoup(response.content, "html.parser")

        quickfinder_section = soup.find(class_="quickfinder")
        if quickfinder_section is None:
            return []

        results = []
        for a in quickfinder_section.find_all("a"):
            # Compute the stripped text once; anchors without visible text
            # (e.g. icon-only links) are skipped.
            text = a.get_text(strip=True)
            if text:
                results.append({"text": text, "href": a.get("href")})
        return results
    43