-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathreadability.py
89 lines (74 loc) · 2.88 KB
/
readability.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from collections import Counter
from typing import Tuple
import nlpnet
from nltk import RegexpTokenizer, tokenize
from tqdm import tqdm
from build_dataset import Corpus
from syllables import Silabizer
class Readability:
"""
Calculates some readability statistics about the corpus
"""
def __init__(self, essay):
self.essay = essay
self.tokenizer = RegexpTokenizer(r'\w+')
self.syllable = Silabizer()
self.tagger = nlpnet.POSTagger('model/pos-pt/', language='pt')
def compute_metrics(self) -> Tuple[float, float, float, float]:
"""
Computes readability metrics
"""
ttr, flesch, hapax, l_dens = 0, 0, 0, 0
total = len(self.essay['essay'])
for paragraphs in tqdm(self.essay['essay']):
words, tokens, sentences = [], [], []
for paragraph in paragraphs:
sentences.extend(tokenize.sent_tokenize(paragraph, language='portuguese'))
for sentence in sentences:
tokens.extend(tokenize.word_tokenize(sentence, language='portuguese'))
words.extend(self.tokenizer.tokenize(sentence))
ttr += self.lexical_diversity(words)
flesch += self.flesh_score(words, sentences)
hapax += self.hapax_legomenon(tokens)
l_dens += self.lexical_density(sentences, words)
return ttr / total, flesch / total, hapax / total, l_dens / total
@staticmethod
def lexical_diversity(words: list) -> float:
"""
Computes lexical diversity
"""
return len(set(words)) / len(words)
def lexical_density(self, sentences: list, words: list) -> float:
"""
Computes lexical density
"""
open_class = []
for sentence in sentences:
for text, tag in self.tagger.tag(sentence)[0]:
if tag == 'N' or tag == 'ADJ' or tag == 'ADV' or tag == 'IN' or tag == 'V' or tag == 'NPROP':
open_class.append(text)
return len(open_class) / len(words)
def flesh_score(self, words: list, sentences: list) -> float:
"""
Computes flesch score
"""
syllables = cont_syllables(words, self.syllable)
return 206.835 - 1.015 * (len(words) / len(sentences)) - 84.6 * (syllables / len(words))
@staticmethod
def hapax_legomenon(tokens: list) -> float:
"""
Computes hapax legomenon
"""
count = Counter(tokens)
hapax = sum(value == 1 for value in count.values())
return hapax
# return 100 * log(len(tokens)) / (1 - (hapax / len(list(set(tokens)))))
def cont_syllables(words: list, syllable: Silabizer) -> int:
"""
Counts syllables
"""
return len([syllable(word) for word in words])
if __name__ == '__main__':
essay = Corpus().read_corpus()
r = Readability(essay).compute_metrics()
print(r)