Predicting the extent of natural disasters

The script predicts the severity level of a natural disaster (on a scale of 0 to 3) from a free-text description. The model performs best when the description mentions the number of deaths, the number of injuries, and the extent of property damage. Vietnamese is currently the only supported language. The script's details are available on GitHub: https://github.com/CTUbase/windmill-plugins/tree/main/model_1. A minimal usage sketch follows the code listing below.

Script (GitHub)

by supermoikeroise41346271 · 12/7/2024 · Python3

    from io import BytesIO
    import string

    import joblib
    import pandas as pd
    import requests
    from pyvi import ViTokenizer


    def create_tokens(doc) -> list:
        # Word-segment the Vietnamese text, lowercase it, and strip punctuation.
        # Underscores are kept because ViTokenizer joins compound words with them.
        doc = ViTokenizer.tokenize(doc)
        doc = doc.lower()
        tokens = doc.split()
        table = str.maketrans("", "", string.punctuation.replace("_", ""))
        tokens = [w.translate(table) for w in tokens]
        tokens = [word for word in tokens if word]

        # Drop Vietnamese stopwords fetched from the repo.
        stopwords = get_tokens_from_url(
            "https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/stopwords.txt"
        )
        tokens = [word for word in tokens if word not in stopwords]
        return tokens


    def get_tokens_from_url(url):
        # One token per line of the remote file; returning an empty set on
        # failure keeps the membership tests downstream from raising on None.
        response = requests.get(url)
        if response.status_code != 200:
            return set()
        return set(response.text.splitlines())


    def count_tokens_in_url(url, tokens):
        # Count how many tokens appear in the keyword file at the given URL.
        keywords = get_tokens_from_url(url)
        count = 0
        for token in tokens:
            if token in keywords:
                count += 1
        return count


    def count_tokens(check_tokens, tokens):
        # Count the tokens that occur in check_tokens (exact matches).
        count = 0
        for token in tokens:
            if token in check_tokens:
                count += 1
        return count


    def count_partial_tokens(check_tokens, tokens):
        # Count the tokens that contain any check_token as a substring.
        count = 0
        for token in tokens:
            for check_token in check_tokens:
                if check_token in token:
                    count += 1
                    break  # Stop checking other check_tokens for this token
        return count


    def first_appearance_index(test_token_list, context):
        # Return the 1-based index of the first keyword set that matches the
        # context, or 0 if none matches.
        for idx, test_tokens in enumerate(test_token_list):
            if count_tokens(test_tokens, context) > 0:
                return idx + 1
        return 0


    def appearance_array(test_token_list, context):
        # Count how often each keyword set appears in the context.
        return [count_tokens(test_tokens, context) for test_tokens in test_token_list]


    def remove_tokens(tokens, check_tokens):
        # Rebuild the list in place instead of calling list.remove() while
        # iterating over it, which would skip elements.
        tokens[:] = [token for token in tokens if token not in check_tokens]
        return tokens


    def create_input(article):
        # Fetch the keyword sets from the corresponding files on the GitHub repo.
        people_death_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/people_death_keywords.txt')
        people_injuries_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/people_injuries_keywords.txt')
        large_property_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/property_large_keywords.txt')
        average_property_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/property_medium_keywords.txt')
        small_property_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/property_small_keywords.txt')
        serious_keyword_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/serious_keywords.txt')

        # Initialise the feature dictionary.
        data = {
            'deaths': 0,
            'injuries': 0,
            'property': [],
            'keyword': 0
        }

        tokens = create_tokens(article)

        # Convert Vietnamese number words to digits.
        number_conversion = {
            "trăm": 100,
            "ngàn": 1000,
            "nghìn": 1000,
            "vạn": 10000,
            "triệu": 1000000,
            "tỷ": 1000000000,
            "trăm_ngàn": 100000,
            "chục_ngàn": 10000,
            "chục": 10,
            "mươi": 10,
            "nhiều": 2
        }

        for idx, token in enumerate(tokens):
            if token in number_conversion:
                tokens[idx] = str(number_conversion[token])

        people_token_list = [people_death_tokens, people_injuries_tokens]
        property_token_list = [large_property_tokens, average_property_tokens, small_property_tokens]

        # For every number, inspect a short window of neighbouring tokens to
        # decide whether it counts deaths or injuries. Matched context words
        # are remembered, filtered out of later windows, and removed from the
        # token list after the loop (removing them mid-iteration skips list
        # elements, and tokens.index(token) mis-windows duplicate numbers).
        matched_context = set()
        for idx, token in enumerate(tokens):
            if not token.isdigit():
                continue

            left_context = [t for t in tokens[max(0, idx - 2):idx] if t not in matched_context]
            check_value = first_appearance_index(people_token_list, left_context)
            if check_value != 0:
                if check_value == 1:
                    data['deaths'] += int(token)
                else:
                    data['injuries'] += int(token)
                matched_context.update(left_context)
                continue

            right_context = [t for t in tokens[idx + 1:idx + 2] if t not in matched_context]
            check_value = first_appearance_index(people_token_list, right_context)
            if check_value != 0:
                if check_value == 1:
                    data['deaths'] += int(token)
                else:
                    data['injuries'] += int(token)
                matched_context.update(right_context)

        tokens = remove_tokens(tokens, matched_context)

        # Counts of large / medium / small property-damage keywords.
        data['property'] = appearance_array(property_token_list, tokens)

        # Density of "serious" keywords among the remaining longer tokens;
        # guard against an empty token list.
        filtered_tokens = [word for word in tokens if len(word) > 2]
        length = len(filtered_tokens)
        data['keyword'] = count_tokens(serious_keyword_tokens, filtered_tokens) / length if length else 0

        return data


    def main(article: str):
        data = create_input(article)

        # Download the pre-trained decision tree and load it from memory.
        joblib_url = "https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/models/best_decision_tree_model.joblib"
        response = requests.get(joblib_url)
        response.raise_for_status()
        model = joblib.load(BytesIO(response.content))

        # Convert the data dictionary to a DataFrame with the feature names
        # the model was trained on.
        data_df = pd.DataFrame(
            [
                {
                    "death": data["deaths"],
                    "injuries": data["injuries"],
                    "property_large": data["property"][0],
                    "property_medium": data["property"][1],
                    "property_small": data["property"][2],
                    "keyword": data["keyword"],
                }
            ]
        )

        # Make the prediction.
        prediction = model.predict(data_df)
        return prediction[0]
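
Below is a minimal usage sketch. The Vietnamese input sentence is a hypothetical example, and the actual level returned depends on the downloaded decision tree; the script also needs network access to fetch the keyword files and the model:

    # Illustrative only: a hypothetical Vietnamese description mentioning
    # deaths, injuries, and property damage.
    if __name__ == "__main__":
        article = (
            "Bão lớn làm 5 người chết, 12 người bị thương "
            "và hàng trăm ngôi nhà bị hư hỏng nặng."
        )
        level = main(article)
        print(f"Predicted disaster level: {level}")  # an integer from 0 to 3

Note that every call re-downloads the keyword lists and the model; caching them locally would avoid the repeated HTTP round trips.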