{"flow":{"id":57,"summary":"Flow suggests disaster locations","versions":[209,210,211],"created_by":"supermoikeroise41346271","created_at":"2024-12-07T10:10:55.955Z","votes":1,"approved":false,"apps":[],"value":{"modules":[{"id":"a","value":{"lock":"beautifulsoup4==4.12.3\nbs4==0.0.2\ncertifi==2024.8.30\ncharset-normalizer==3.4.0\nidna==3.10\njoblib==1.4.2\nrequests==2.32.3\nsoupsieve==2.6\nurllib3==2.2.3","type":"rawscript","content":"from joblib import Parallel, delayed\nimport requests\nfrom bs4 import BeautifulSoup\n\ndef main(keyword: str, maximum:int = 5):\n    def fetch_page(page):\n        find_url = f\"https://timkiem.vnexpress.net/?q={keyword}&media_type=all&fromdate=0&todate=0&latest=&cate_code=&search_f=title,tag_list&date_format=all&page={page}\"\n        response = requests.get(find_url)\n        response.raise_for_status()\n\n        soup = BeautifulSoup(response.text, \"html.parser\")\n\n        articles = soup.find_all(\"article\", class_=\"item-news\")\n        return [article.get(\"data-url\") for article in articles if article.get(\"data-url\")]\n\n    results = Parallel(n_jobs=-1)(delayed(fetch_page)(i) for i in range(1, maximum + 1))\n    urls = [url for sublist in results for url in sublist]\n    print(\"Finished fetching URLs\")\n    return urls","language":"python3","is_trigger":false,"input_transforms":{"keyword":{"expr":"flow_input.keyword","type":"javascript"},"maximum":{"expr":"flow_input.maximum","type":"javascript"}}},"summary":"Get url"},{"id":"b","value":{"type":"forloopflow","modules":[{"id":"c","value":{"lock":"beautifulsoup4==4.12.3\nbs4==0.0.2\ncertifi==2024.8.30\ncharset-normalizer==3.4.0\nidna==3.10\nrequests==2.32.3\nsoupsieve==2.6\nurllib3==2.2.3","type":"rawscript","content":"import requests\nfrom bs4 import BeautifulSoup\n\n\ndef main(url):\n    # Gửi request đến trang web\n    response = requests.get(url)\n    response.raise_for_status()  # Kiểm tra nếu request thành công\n\n    # Phân tích nội dung HTML\n    soup = BeautifulSoup(response.text, \"html.parser\")\n\n    # Tìm tất cả các thẻ h1 có class=\"title-detail\"\n    title_of_article = soup.find(\"h1\", class_=\"title-detail\")\n    if title_of_article is None:\n        return None\n    title_of_article = title_of_article.text\n\n    description_of_article = soup.find(\"p\", class_=\"description\").text\n\n    # Tìm thẻ article có class=\"fck_detail\"\n    article = soup.find(\"article\", class_=\"fck_detail\")\n\n    # Lấy nội dung các thẻ p bên trong thẻ article\n    paragraphs = article.find_all(\"p\")\n    detail_of_article = [p.text for p in paragraphs]\n    detail_of_article.pop().strip()\n\n    detail_of_article = \" \".join(detail_of_article)\n\n    return {\n        \"title\": title_of_article,\n        \"description\": description_of_article,\n        \"content\": detail_of_article,\n    }\n","language":"python3","is_trigger":false,"input_transforms":{"url":{"expr":"flow_input.iter.value","type":"javascript"}}},"summary":"Crawl data from url"},{"id":"d","value":{"lock":"certifi==2024.8.30\ncharset-normalizer==3.4.0\nidna==3.10\njoblib==1.4.2\nnumpy==2.1.3\npython-crfsuite==0.9.11\npyvi==0.1.1\nrequests==2.32.3\nscikit-learn==1.5.2\nscipy==1.14.1\nsklearn-crfsuite==0.5.0\ntabulate==0.9.0\nthreadpoolctl==3.5.0\ntqdm==4.67.1\nurllib3==2.2.3","type":"rawscript","content":"import requests\nfrom pyvi import ViTokenizer\nimport string\n\n\ndef get_tokens_from_url(url):\n    response = requests.get(url)\n    if response.status_code != 200:\n        return None\n\n    return set(response.text.splitlines())\n\n\ndef count_tokens_in_url(url, tokens):\n    keywords = get_tokens_from_url(url)\n\n    count = 0\n    for token in tokens:\n        if token in keywords:\n            count += 1\n\n    return count\n\n\ndef create_tokens(doc) -> list:\n    doc = ViTokenizer.tokenize(doc)\n    doc = doc.lower()\n    tokens = doc.split()\n    table = str.maketrans(\"\", \"\", string.punctuation.replace(\"_\", \"\"))\n    tokens = [w.translate(table) for w in tokens]\n    tokens = [word for word in tokens if word]\n    stopwords = get_tokens_from_url(\n        \"https://raw.githubusercontent.com/NguyenDoanHoangPhuc/txt_for_ai/main/stopwords.txt\"\n    )\n\n    tokens = [word for word in tokens if word not in stopwords and not word.isdigit()]\n    tokens = [word for word in tokens if len(word) > 2]\n    return tokens\n\n\ndef main(article):\n    if article is None:\n        return None\n\n    title = article[\"title\"]\n    description = article[\"description\"]\n    content = article[\"content\"]\n\n    doc = title + \" \" + description + \" \" + content\n    tokens = create_tokens(doc)\n\n    len_tokens = len(tokens)\n\n    region_count = count_tokens_in_url(\n        \"https://raw.githubusercontent.com/NguyenDoanHoangPhuc/txt_for_ai/main/region.txt\",\n        tokens,\n    )\n    baolu_count = count_tokens_in_url(\n        \"https://raw.githubusercontent.com/NguyenDoanHoangPhuc/txt_for_ai/main/bao_lu.txt\",\n        tokens,\n    )\n    dichbenh_count = count_tokens_in_url(\n        \"https://raw.githubusercontent.com/NguyenDoanHoangPhuc/txt_for_ai/main/dich_benh.txt\",\n        tokens,\n    )\n\n    region_ratio = region_count / len_tokens\n    baolu_ratio = baolu_count / len_tokens\n    dichbenh_ratio = dichbenh_count / len_tokens\n\n    doc = title + \" \" + description + \" \" + content\n    tokens = create_tokens(doc)\n   \n    region_keywords = get_tokens_from_url(\"https://raw.githubusercontent.com/NguyenDoanHoangPhuc/txt_for_ai/main/region.txt\")\n\n    found_locations = list(\n        set(token for token in tokens if token in region_keywords and \"_\" in token)\n    )\n\n    return {\n        \"article\": article,\n        \"ratio\": [region_ratio, baolu_ratio, dichbenh_ratio],\n        \"locations\": found_locations,\n    }\n","language":"python3","is_trigger":false,"input_transforms":{"article":{"expr":"results.c","type":"javascript"}}},"summary":"Turn the data to the model input"},{"id":"e","value":{"lock":"","type":"rawscript","content":"def main(data, url):\n    ratios = data[\"ratio\"]\n    locations = data['locations']\n\n    weight = [0.12054469, 0.05292558, 0.01311273]\n    bias = -0.019999999999999997\n\n    calc = ratios[0] * weight[0] + ratios[1] * weight[1] + ratios[2] * weight[2] + bias\n    prediction = 1 if calc >= 0 else 0\n\n    if prediction == 1:\n        result = [\n            (url, location)\n            for location in locations\n        ]\n        return result\n    else:\n        return None\n","language":"python3","is_trigger":false,"input_transforms":{"url":{"expr":"flow_input.iter.value","type":"javascript"},"data":{"expr":"results.d","type":"javascript"}}},"summary":"Use model to check"}],"iterator":{"expr":"results.a","type":"javascript"},"parallel":true,"skip_failures":true}},{"id":"f","value":{"lock":"","type":"rawscript","content":"def main(data):\n    # Tập hợp lưu trữ các địa danh đã gặp\n    unique_locations = set()\n    # Danh sách kết quả\n    result = []\n\n    for entry in data:\n        if isinstance(entry, list):  # Chỉ xử lý các phần tử kiểu danh sách\n            for pair in entry:\n                # Kiểm tra nếu phần tử là danh sách và địa danh chưa xuất hiện\n                if isinstance(pair, list) and pair[1] not in unique_locations:\n                    unique_locations.add(pair[1])  # Đánh dấu địa danh đã gặp\n                    result.append(pair)  # Thêm cặp (url, địa danh) vào kết quả\n\n    return result\n","language":"python3","is_trigger":false,"input_transforms":{"data":{"expr":"results.b","type":"javascript"}}},"summary":"Clean data"}]},"schema":{"type":"object","order":["keyword","maximum"],"$schema":"https://json-schema.org/draft/2020-12/schema","required":[],"properties":{"keyword":{"type":"string","default":"","description":""},"maximum":{"type":"integer"}}},"description":"","recording":null,"vcreated_at":"2024-12-07T15:59:21.234Z","vcreated_by":"supermoikeroise41346271","comments":[{"id":54,"content":"Hi, I am the creator of this flow.\nThis flow has the function of crawling data from the Vietnamese website VNExpress to get information about articles related to natural disasters such as storms, or epidemics. From there, it can retrieve locations such as provinces in Vietnam to help us know where is in danger\nYou can see the details again on the github link:\nhttps://github.com/CTUbase/windmill-plugins/tree/main/model_2","created_by":"supermoikeroise41346271","created_at":"2024-12-07T16:02:47.357Z","votes":0}]}}