# text_mining.py
import pandas as pd
from string import digits
import re
from pathlib import Path
class TextCleaner:
    """Clean raw text and count word frequencies against a stopword list."""

    def __get_stop_words(self):
        # Load the stopword list that sits next to this module (one word per line).
        stopwords = pd.read_csv(Path(__file__).resolve().parent / "stopwords.txt", names=['words'])
        return stopwords
    def __get_text_df(self, text):
        # One row per whitespace-separated token.
        text_df = pd.DataFrame(data=text.split(), columns=['words'])
        return text_df
    def __cleaning_text(self, text):
        def remove_emoji(text):
            emoji_pattern = re.compile("["
                                       u"\U0001F600-\U0001F64F"  # emoticons
                                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                       u"\U00002702-\U000027B0"
                                       u"\U000024C2-\U0001F251"
                                       "]+", flags=re.UNICODE)
            return emoji_pattern.sub(r'', text)

        text = text.lower().strip()
        # drop tokens containing underscores or mentions (e.g. @username)
        text = " ".join([i for i in text.split() if '_' not in i])
        text = " ".join([i for i in text.split() if '@' not in i])
        # remove URLs
        text = re.sub(r"http\S+", "", text)
        text = remove_emoji(text=text)
        # strip remaining punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # remove digits
        remove_digits = str.maketrans('', '', digits)
        text = text.translate(remove_digits)
        return text
    def get_clean_text(self, text):
        # Clean the raw text, drop stopwords, and return per-word counts.
        text = self.__cleaning_text(text)
        text_df = self.__get_text_df(text)
        stopwords = self.__get_stop_words()
        clean_text_df = text_df[~text_df.words.isin(stopwords.words)].reset_index(drop=True)
        # Name the index/value columns explicitly so the output is always
        # ['word', 'count'], regardless of pandas version.
        clean_text_df = (clean_text_df.words.value_counts()
                         .rename_axis('word')
                         .reset_index(name='count')
                         .sort_values(by=['count'], ascending=False)
                         .reset_index(drop=True))
        return clean_text_df
    # insert new stopword(s) into stopwords.txt
    def insert_new_word(self, new_word):
        stop_words = self.__get_stop_words()
        # Accept either a single word or a list of words.
        if isinstance(new_word, list):
            new_word = [i.strip().lower() for i in new_word]
        else:
            new_word = [new_word.strip().lower()]
        words = []
        for word in new_word:
            if word not in list(stop_words.words):
                words.append(word)
                print(f"insert_new_word: {word} has been inserted")
            else:
                print(f"insert_new_word: {word} already exists")
        if words:
            new_stopwords_df = pd.concat([stop_words, pd.DataFrame(data=words, columns=['words'])])
            # Write back to the same stopwords.txt the class reads from,
            # not to one relative to the current working directory.
            new_stopwords_df.to_csv(Path(__file__).resolve().parent / "stopwords.txt",
                                    index=False, header=False)
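

# Minimal usage sketch (illustrative only, not part of the original module).
# It assumes a stopwords.txt file exists next to this module, one word per line,
# and that the sample string below is just a hypothetical input.
if __name__ == "__main__":
    cleaner = TextCleaner()
    sample = "Check https://example.com @user hello hello world 123!"
    print(cleaner.get_clean_text(sample))        # DataFrame with 'word' and 'count' columns
    cleaner.insert_new_word(["hello", "world"])  # appends new stopwords to stopwords.txt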