from functools import lru_cache
from io import BytesIO
import string

import joblib
import pandas as pd
import requests
from pyvi import ViTokenizer
|
def create_tokens(doc) -> list:
    """Tokenize a Vietnamese document: word-segment, lowercase, strip punctuation, drop stopwords."""
    doc = ViTokenizer.tokenize(doc)
    doc = doc.lower()
    tokens = doc.split()
    # Keep underscores: pyvi joins the syllables of a compound word with "_".
    table = str.maketrans("", "", string.punctuation.replace("_", ""))
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word]
    stopwords = get_tokens_from_url(
        "https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/stopwords.txt"
    )
    tokens = [word for word in tokens if word not in stopwords]
    return tokens
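
# Example (illustrative; the actual output depends on pyvi's segmentation and
# on the fetched stopword list):
#   create_tokens("Tai nạn giao thông làm 2 người chết")
#   -> ['tai_nạn', 'giao_thông', '2', 'người', 'chết']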
|
@lru_cache(maxsize=None)  # keyword files are static; download each at most once
def get_tokens_from_url(url):
    """Fetch a newline-separated keyword file and return its lines as a set."""
    response = requests.get(url)
    # Fail loudly: returning None would crash every caller on `token in None`.
    response.raise_for_status()
    return set(response.text.splitlines())
|
def count_tokens_in_url(url, tokens):
    """Count how many of `tokens` appear in the keyword set fetched from `url`."""
    return count_tokens(get_tokens_from_url(url), tokens)
|
def count_tokens(check_tokens, tokens):
    """Count how many of `tokens` appear in `check_tokens` (duplicates counted)."""
    return sum(1 for token in tokens if token in check_tokens)
|
def count_partial_tokens(check_tokens, tokens):
    """Count tokens that contain any check token as a substring."""
    # `any` stops at the first matching check token, so each token counts once.
    return sum(1 for token in tokens if any(ct in token for ct in check_tokens))
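
# Example: count_partial_tokens({"thương"}, ["bị_thương", "chết"]) -> 1,
# because "thương" occurs inside "bị_thương".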
|
def first_appearance_index(test_token_list, context):
    """Return the 1-based index of the first token set that matches `context`, or 0 if none do."""
    for idx, test_token in enumerate(test_token_list):
        if count_tokens(test_token, context) > 0:
            return idx + 1
    return 0
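
# Example: with people_token_list = [death_keywords, injury_keywords],
# first_appearance_index(people_token_list, ["người", "chết"]) returns 1
# (assuming "chết" is a death keyword), 2 if only an injury keyword matches,
# and 0 if the context matches neither set.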
|
def appearance_array(test_token_list, context):
    """Count matches of each token set in `context`, one count per set."""
    return [count_tokens(test_token, context) for test_token in test_token_list]
|
def remove_tokens(tokens, check_tokens):
    """Remove every token that appears in `check_tokens`, mutating `tokens` in place."""
    # Rebuild via slice assignment: list.remove() while iterating skips elements.
    tokens[:] = [token for token in tokens if token not in check_tokens]
    return tokens
|
def create_input(article):
    """Turn a raw article into the feature dict the severity model expects."""
    # Fetch the keyword sets from the corresponding files in the GitHub repo.
    people_death_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/people_death_keywords.txt')
    people_injuries_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/people_injuries_keywords.txt')
    large_property_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/property_large_keywords.txt')
    average_property_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/property_medium_keywords.txt')
    small_property_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/property_small_keywords.txt')
    serious_keyword_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/serious_keywords.txt')

    # Initialise the feature dictionary.
    data = {
        'deaths': 0,
        'injuries': 0,
        'property': [],
        'keyword': 0
    }

    tokens = create_tokens(article)

    # Normalise Vietnamese number words to digit strings. The substitution is
    # per-token, so e.g. "trăm" alone becomes "100"; "nhiều" ("many") is
    # crudely approximated as 2.
    number_conversion = {
        "trăm": 100,
        "ngàn": 1000,
        "nghìn": 1000,
        "vạn": 10000,
        "triệu": 1000000,
        "tỷ": 1000000000,
        "trăm_ngàn": 100000,
        "chục_ngàn": 10000,
        "chục": 10,
        "mươi": 10,
        "nhiều": 2
    }

    for idx, token in enumerate(tokens):
        if token in number_conversion:
            tokens[idx] = str(number_conversion[token])

    people_token_list = [people_death_tokens, people_injuries_tokens]
    property_token_list = [large_property_tokens, average_property_tokens, small_property_tokens]

    # For each number, look for a death/injury keyword just before or just
    # after it. enumerate() avoids the first-occurrence bug of tokens.index()
    # on duplicate numbers, and removals are deferred so the list is never
    # mutated mid-loop.
    consumed = []
    for idx, token in enumerate(tokens):
        if not token.isdigit():
            continue

        # Two tokens immediately to the left of the number.
        left_context = tokens[max(0, idx - 2):idx]
        check_value = first_appearance_index(people_token_list, left_context)
        if check_value != 0:
            if check_value == 1:
                data['deaths'] += int(token)
            else:
                data['injuries'] += int(token)
            consumed.extend(left_context)
            continue

        # Otherwise, the single token immediately to the right.
        right_context = tokens[idx + 1:idx + 2]
        check_value = first_appearance_index(people_token_list, right_context)
        if check_value != 0:
            if check_value == 1:
                data['deaths'] += int(token)
            else:
                data['injuries'] += int(token)
            consumed.extend(right_context)

    remove_tokens(tokens, consumed)

    # One count per property-severity keyword set: [large, medium, small].
    data['property'] = appearance_array(property_token_list, tokens)

    # Density of "serious" keywords among tokens longer than two characters.
    filtered_tokens = [word for word in tokens if len(word) > 2]
    length = len(filtered_tokens)
    data['keyword'] = count_tokens(serious_keyword_tokens, filtered_tokens) / length if length else 0

    return data
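
# Shape of the returned features (values illustrative):
#   {'deaths': 2, 'injuries': 3, 'property': [1, 0, 0], 'keyword': 0.05}
# where 'property' holds the [large, medium, small] property-keyword counts.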
|
def main(article: str):
    data = create_input(article)

    # Download the pickled decision tree and load it straight from memory.
    joblib_url = "https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/models/best_decision_tree_model.joblib"
    response = requests.get(joblib_url)
    response.raise_for_status()
    model = joblib.load(BytesIO(response.content))

    # Convert the feature dict to a DataFrame with the column names the
    # model was trained on.
    data_df = pd.DataFrame(
        [
            {
                "death": data["deaths"],
                "injuries": data["injuries"],
                "property_large": data["property"][0],
                "property_medium": data["property"][1],
                "property_small": data["property"][2],
                "keyword": data["keyword"],
            }
        ]
    )

    # Make the prediction.
    prediction = model.predict(data_df)
    return prediction[0]
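

if __name__ == "__main__":
    # Minimal smoke test. Assumes network access to the GitHub raw URLs above;
    # the sample sentence is illustrative, not taken from the project's data.
    sample = "Vụ tai nạn nghiêm trọng làm 2 người chết và 3 người bị thương"
    print(main(sample))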
|