from functools import lru_cache
from io import BytesIO
import string

import joblib
import pandas as pd
import requests
from pyvi import ViTokenizer
|
def create_tokens(doc) -> list:
    """Tokenize a Vietnamese document: word-segment, lowercase, strip punctuation, drop stopwords."""
    doc = ViTokenizer.tokenize(doc)
    doc = doc.lower()
    tokens = doc.split()
    # Keep underscores: pyvi joins the syllables of a compound word with "_".
    table = str.maketrans("", "", string.punctuation.replace("_", ""))
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word]
    stopwords = get_tokens_from_url(
        "https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/stopwords.txt"
    )
    tokens = [word for word in tokens if word not in stopwords]
    return tokens
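
# Example (illustrative; the actual output depends on pyvi's segmentation and
# on the fetched stopword list):
#   create_tokens("Tai nạn giao thông làm 2 người chết")
#   -> ['tai_nạn', 'giao_thông', '2', 'người', 'chết']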
|
@lru_cache(maxsize=None)  # keyword files are static; download each at most once
def get_tokens_from_url(url):
    """Fetch a newline-separated keyword file and return its lines as a set."""
    response = requests.get(url)
    # Fail loudly: returning None would crash every caller on `token in None`.
    response.raise_for_status()
    return set(response.text.splitlines())
|
def count_tokens_in_url(url, tokens):
    """Count how many of `tokens` appear in the keyword set fetched from `url`."""
    return count_tokens(get_tokens_from_url(url), tokens)
|
def count_tokens(check_tokens, tokens):
    """Count how many of `tokens` appear in `check_tokens` (duplicates counted)."""
    return sum(1 for token in tokens if token in check_tokens)
|
def count_partial_tokens(check_tokens, tokens):
    """Count tokens that contain any check token as a substring."""
    # `any` stops at the first matching check token, so each token counts once.
    return sum(1 for token in tokens if any(ct in token for ct in check_tokens))
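
# Example: count_partial_tokens({"thương"}, ["bị_thương", "chết"]) -> 1,
# because "thương" occurs inside "bị_thương".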
|
def first_appearance_index(test_token_list, context):
    """Return the 1-based index of the first token set that matches `context`, or 0 if none do."""
    for idx, test_token in enumerate(test_token_list):
        if count_tokens(test_token, context) > 0:
            return idx + 1
    return 0
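
# Example: with people_token_list = [death_keywords, injury_keywords],
# first_appearance_index(people_token_list, ["người", "chết"]) returns 1
# (assuming "chết" is a death keyword), 2 if only an injury keyword matches,
# and 0 if the context matches neither set.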
|
def appearance_array(test_token_list, context):
    """Count matches of each token set in `context`, one count per set."""
    return [count_tokens(test_token, context) for test_token in test_token_list]
|
def remove_tokens(tokens, check_tokens):
    """Remove every token that appears in `check_tokens`, mutating `tokens` in place."""
    # Rebuild via slice assignment: list.remove() while iterating skips elements.
    tokens[:] = [token for token in tokens if token not in check_tokens]
    return tokens
|
def create_input(article):
    """Turn a raw article into the feature dict the severity model expects."""
    # Fetch the keyword sets from the corresponding files in the GitHub repo.
    people_death_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/people_death_keywords.txt')
    people_injuries_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/people_injuries_keywords.txt')
    large_property_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/property_large_keywords.txt')
    average_property_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/property_medium_keywords.txt')
    small_property_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/property_small_keywords.txt')
    serious_keyword_tokens = get_tokens_from_url('https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/sample/serious_keywords.txt')

    # Initialise the feature dictionary.
    data = {
        'deaths': 0,
        'injuries': 0,
        'property': [],
        'keyword': 0
    }

    tokens = create_tokens(article)

    # Normalise Vietnamese number words to digit strings. The substitution is
    # per-token, so e.g. "trăm" alone becomes "100"; "nhiều" ("many") is
    # crudely approximated as 2.
    number_conversion = {
        "trăm": 100,
        "ngàn": 1000,
        "nghìn": 1000,
        "vạn": 10000,
        "triệu": 1000000,
        "tỷ": 1000000000,
        "trăm_ngàn": 100000,
        "chục_ngàn": 10000,
        "chục": 10,
        "mươi": 10,
        "nhiều": 2
    }

    for idx, token in enumerate(tokens):
        if token in number_conversion:
            tokens[idx] = str(number_conversion[token])

    people_token_list = [people_death_tokens, people_injuries_tokens]
    property_token_list = [large_property_tokens, average_property_tokens, small_property_tokens]

    # For each number, look for a death/injury keyword just before or just
    # after it. enumerate() avoids the first-occurrence bug of tokens.index()
    # on duplicate numbers, and removals are deferred so the list is never
    # mutated mid-loop.
    consumed = []
    for idx, token in enumerate(tokens):
        if not token.isdigit():
            continue

        # Two tokens immediately to the left of the number.
        left_context = tokens[max(0, idx - 2):idx]
        check_value = first_appearance_index(people_token_list, left_context)
        if check_value != 0:
            if check_value == 1:
                data['deaths'] += int(token)
            else:
                data['injuries'] += int(token)
            consumed.extend(left_context)
            continue

        # Otherwise, the single token immediately to the right.
        right_context = tokens[idx + 1:idx + 2]
        check_value = first_appearance_index(people_token_list, right_context)
        if check_value != 0:
            if check_value == 1:
                data['deaths'] += int(token)
            else:
                data['injuries'] += int(token)
            consumed.extend(right_context)

    remove_tokens(tokens, consumed)

    # One count per property-severity keyword set: [large, medium, small].
    data['property'] = appearance_array(property_token_list, tokens)

    # Density of "serious" keywords among tokens longer than two characters.
    filtered_tokens = [word for word in tokens if len(word) > 2]
    length = len(filtered_tokens)
    data['keyword'] = count_tokens(serious_keyword_tokens, filtered_tokens) / length if length else 0

    return data
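
# Shape of the returned features (values illustrative):
#   {'deaths': 2, 'injuries': 3, 'property': [1, 0, 0], 'keyword': 0.05}
# where 'property' holds the [large, medium, small] property-keyword counts.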
|
def main(article: str):
    data = create_input(article)

    # Download the pickled decision tree and load it straight from memory.
    joblib_url = "https://raw.githubusercontent.com/CTUbase/windmill-plugins/main/model_2/models/best_decision_tree_model.joblib"
    response = requests.get(joblib_url)
    response.raise_for_status()
    model = joblib.load(BytesIO(response.content))

    # Convert the feature dict to a DataFrame with the column names the
    # model was trained on.
    data_df = pd.DataFrame(
        [
            {
                "death": data["deaths"],
                "injuries": data["injuries"],
                "property_large": data["property"][0],
                "property_medium": data["property"][1],
                "property_small": data["property"][2],
                "keyword": data["keyword"],
            }
        ]
    )

    # Make the prediction.
    prediction = model.predict(data_df)
    return prediction[0]
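

if __name__ == "__main__":
    # Minimal smoke test. Assumes network access to the GitHub raw URLs above;
    # the sample sentence is illustrative, not taken from the project's data.
    sample = "Vụ tai nạn nghiêm trọng làm 2 người chết và 3 người bị thương"
    print(main(sample))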
|