-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathstatistics.py
127 lines (115 loc) · 5.32 KB
/
statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from collections import Counter
from typing import Tuple, List
import numpy as np
import seaborn as sns
import spacy
from matplotlib import pyplot as plt
from nltk import tokenize
from pandas import DataFrame
from tqdm import tqdm
from build_dataset import Corpus
class Statistic:
    """
    Calculates some statistics about the corpus.

    The split handed to the constructor is read through ``split['essay']``,
    ``split['competence']`` and ``split['score']``.
    """

    # Number of competence grades read from every entry of ``split['competence']``.
    _NUM_COMPETENCIES = 5

    # Dependency labels treated as passive-voice markers on the dependents of a sentence root.
    # NOTE(review): assumes the loaded spaCy model emits Universal-Dependencies style ``:pass``
    # subtypes -- confirm against the label set of 'pt_core_news_sm'.
    _PASSIVE_DEPS = frozenset({'nsubj:pass', 'csubj:pass', 'aux:pass'})

    def __init__(self, split):
        """
        Stores the split and loads the Portuguese spaCy pipeline.

        :param split: corpus split to analyse
        """
        self.essay = split
        self.nlp = spacy.load('pt_core_news_sm')

    def statistics_essays(self) -> List[Tuple]:
        """
        Computes some statistics about the corpus.

        Every item of ``self.essay['essay']`` is iterated as a sequence of paragraph strings.
        Paragraphs are split into sentences and tokens with NLTK (Portuguese models); sentences
        are tokenized again on their own for the per-sentence figures.

        :return: list of ``(name, value)`` pairs, in this order: para_per_essay, snt_per_para,
            snt_per_essay, std_snt_per_para, std_snt_per_essay, std_para_per_essay, tok_per_snt,
            std_tok_per_snt, tok_per_para, std_tok_per_para, tok_per_essay, std_tok_per_essay
        :raises ZeroDivisionError: if the split has no essays, no paragraphs or no sentences
        """
        essays = self.essay['essay']
        total_essays = len(essays)
        para_per_essay, snt_per_para, snt_per_essay = [], [], []
        tok_per_essay, tok_per_para, tok_per_snt = [], [], []
        for paragraphs in essays:
            para_per_essay.append(len(paragraphs))
            essay_snt, essay_tok = 0, 0
            for paragraph in paragraphs:
                sentences = tokenize.sent_tokenize(paragraph, language='portuguese')
                snt_per_para.append(len(sentences))
                essay_snt += len(sentences)
                tokens_para = tokenize.word_tokenize(paragraph, language='portuguese')
                tok_per_para.append(len(tokens_para))
                essay_tok += len(tokens_para)
                for snt in sentences:
                    tokens = tokenize.word_tokenize(snt, language='portuguese')
                    tok_per_snt.append(len(tokens))
            snt_per_essay.append(essay_snt)
            tok_per_essay.append(essay_tok)

        # Totals are derived from the per-unit counts so both always agree.
        total_paragraphs = sum(para_per_essay)
        total_sentences = sum(snt_per_para)
        total_tok_par = sum(tok_per_para)
        total_tok_snt = sum(tok_per_snt)

        return [
            ('para_per_essay', total_paragraphs / total_essays),
            ('snt_per_para', total_sentences / total_paragraphs),
            ('snt_per_essay', total_sentences / total_essays),
            ('std_snt_per_para', np.std(snt_per_para)),
            ('std_snt_per_essay', np.std(snt_per_essay)),
            ('std_para_per_essay', np.std(para_per_essay)),
            ('tok_per_snt', total_tok_snt / total_sentences),
            ('std_tok_per_snt', np.std(tok_per_snt)),
            ('tok_per_para', total_tok_par / total_paragraphs),
            ('std_tok_per_para', np.std(tok_per_para)),
            # Tokens per essay use the paragraph-level token counts summed over the whole split.
            ('tok_per_essay', total_tok_par / total_essays),
            ('std_tok_per_essay', np.std(tok_per_essay)),
        ]

    def forms_of_voice(self) -> Tuple[int, int]:
        """
        Computes forms of voice.

        Each sentence produced by the spaCy parser is counted once: as passive when any direct
        dependent of its root carries a passive dependency label (see ``_PASSIVE_DEPS``), and as
        active otherwise. Sentences whose root has no dependents are not counted.

        :return: number of active-voice sentences, number of passive-voice sentences
        """
        active_voice, passive_voice = 0, 0
        for essay in tqdm(self.essay['essay']):
            for paragraph in essay:
                sentences = tokenize.sent_tokenize(paragraph, language='portuguese')
                for sentence in sentences:
                    doc = self.nlp(sentence)
                    for snt in doc.sents:
                        # Inspect every dependent of the root, not only the first one: the
                        # first dependent is frequently an adverb, object or punctuation mark.
                        child_deps = {child.dep_ for child in snt.root.children}
                        if not child_deps:
                            continue
                        if child_deps & self._PASSIVE_DEPS:
                            passive_voice += 1
                        else:
                            active_voice += 1
        return active_voice, passive_voice

    def competence_score(self) -> List[Counter]:
        """
        Gets score for each competence.

        :return: one Counter per competence (five in total); the i-th Counter maps each grade
            found at position i of the entries of ``self.essay['competence']`` to its frequency
        :raises IndexError: if an entry holds fewer than five grades
        """
        counters = [Counter() for _ in range(self._NUM_COMPETENCIES)]
        for grades in self.essay['competence']:
            for position, counter in enumerate(counters):
                counter[grades[position]] += 1
        return counters

    def statistics_score(self) -> DataFrame:
        """
        Gets total score of essays.

        :return: number of essays per total score; ``sort=False`` means the counts are not
            sorted by frequency
        """
        # NOTE(review): ``value_counts`` on a pandas Series returns a Series, so the DataFrame
        # annotation looks inaccurate -- confirm the type of ``self.essay['score']``.
        return self.essay['score'].value_counts(sort=False)

    def plot_score(self, top: int) -> None:
        """
        Plots the top X scores of the corpus.

        :param top: number of scores to be plotted
        :return:
        """
        total_scores = self.essay['score'].value_counts()
        # value_counts() sorts by frequency, so the first ``top`` rows are the most common scores.
        top_scores = total_scores.head(top)
        # x/y are passed by keyword: seaborn >= 0.12 accepts only ``data`` positionally.
        plot = sns.barplot(x=top_scores.index, y=top_scores.values)
        for p in plot.patches:
            # Write each bar's height just above its top edge.
            plot.annotate(format(p.get_height(), '.2f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                          ha='center', va='center', xytext=(0, 10), textcoords='offset points')
        plt.show()
if __name__ == '__main__':
    # Script entry point: read the three corpus splits and plot the ten most
    # frequent total scores of the training split.
    training_split, _dev_split, _test_split = Corpus().read_splits()
    report = Statistic(training_split)
    # Other reports available on the same object:
    #   report.statistics_score(), report.competence_score(), report.statistics_essays()
    report.plot_score(top=10)