-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfst_wrapper.py
171 lines (151 loc) · 6.88 KB
/
fst_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import re
import pexpect
import config
class FstWrapper():
    """Thin pexpect wrapper around the interactive program in ``config.fst_string``.

    The child process prints the prompt ``analyze> `` or ``generate> ``
    depending on its current mode; sending an empty line toggles between the
    two.  ``morAnalyseMode`` mirrors which mode the child is believed to be in.
    """

    # Prompts printed by the child process in its two modes.
    _ANALYSE_PROMPT = "analyze> "
    _GENERATE_PROMPT = "generate> "

    # regex for stem guessing NOTE: for now only used for adjectives
    regex_adj_stem = re.compile(r"^(.*?)<")  # TODO: move to regex file

    # Captures the text between '<' and '>' of every fst symbol.
    _SYMBOL_RE = re.compile(r"<(.*?)>")

    # Inflectional class patterns.
    # NOTE: these classes are NOT regex-escaped! so be careful
    _NOUN_CLASSES = (r"NMasc", r"NFem", r"NNeut", r"N\?", r"NTrunc")
    # TODO: NS... are 'nom-classes'. should they be added?  (currently unused)
    _NOM_CLASSES = (r"NSMasc", r"NSFem", r"NSNeut", r"^NS\-er$")
    _NAME_CLASSES = (r"^Name\-", r"^FamName_")
    _NUMBER_CLASSES = (r"^Card", r"^DigOrd$", r"^Ord$", r"^NumAdjFlex$")
    _VERB_CLASSES = (r"^VA", r"^VI", r"^VM", r"^VP", r"^VV")
    _ADJ_CLASSES = (r"^Adj",)
    _ABBREVIATION_CLASSES = (r"^Abk_",)
    # the messy others # TODO ...
    _OTHER_CLASSES = ()

    # All patterns OR-ed together, compiled once instead of on every call.
    _INFL_CLASS_RE = re.compile("|".join(
        _NOUN_CLASSES + _NAME_CLASSES + _NUMBER_CLASSES + _VERB_CLASSES
        + _ADJ_CLASSES + _ABBREVIATION_CLASSES + _OTHER_CLASSES
    ))

    def __init__(self):
        """Spawn the child process and wait for its first ``analyze> `` prompt.

        Raises:
            RuntimeError: if the child reaches EOF or has terminated before
                the prompt shows up; the message is the output seen so far.
        """
        if config.debug_lvl > 0:
            print("try to execute following command:\n'" + config.fst_string + "'")
        self.child = pexpect.spawnu(config.fst_string)
        self.child.delaybeforesend = 0
        # expect() returns the index of the pattern that matched; 1 means EOF.
        index = self.child.expect([self._ANALYSE_PROMPT, pexpect.EOF])
        self.morAnalyseMode = True

        before = self.child.before
        if config.debug_lvl > 0:
            print(before)
        # `terminated` alone is not enough: it may still be False right after
        # EOF was seen, so the matched index is checked as well.
        if index != 0 or self.child.terminated:
            raise RuntimeError(before)

    def _query(self, word, want_analyse):
        """Send ``word`` to the child in the requested mode and collect the output.

        Args:
            word: the string to send; surrounding whitespace is stripped.
            want_analyse: True to query in analyse mode, False for generate mode.

        Returns:
            The output lines between the echoed input and the next prompt,
            or ``[]`` for empty input or a single "no result for ..." line.
        """
        word = word.strip()
        if word == "":
            return []

        prompt = self._ANALYSE_PROMPT if want_analyse else self._GENERATE_PROMPT

        # Switch the child to the requested mode first, if necessary.
        if self.morAnalyseMode != want_analyse:
            self.toggleMorMode()
            self.child.sendline("")  # "" is used in the fst-mor to toggle between analyse/generate
            self.child.expect([prompt, pexpect.EOF])

        self.child.sendline(word)
        # NOTE(review): an EOF match is not treated specially here; whatever
        # the child printed before EOF is parsed like a regular answer.
        self.child.expect([prompt, pexpect.EOF])

        # First element is the echoed input, last one the remainder before the prompt.
        result = self.child.before.split("\r\n")[1:-1]
        if len(result) == 1 and result[0].startswith("no result for "):
            return []
        return result

    def analyse(self, word):
        """Return the list of analyses the child prints for ``word``.

        Switches the child to analyse mode if it is in generate mode.
        Returns ``[]`` for empty input or when the child reports no result.
        """
        return self._query(word, True)

    def generate(self, word):
        """Return the list of forms the child generates for ``word``.

        Switches the child to generate mode if it is in analyse mode.
        Returns ``[]`` for empty input or when the child reports no result.
        """
        return self._query(word, False)

    # if you just want to play around you can use this function
    def openShell(self):
        """Forward lines typed by the user to the child process in a loop.

        An empty input line toggles the tracked mode (it is still sent to the
        child, which toggles on an empty line).  The loop ends when the user's
        input stream is closed or the child reaches EOF.
        """
        while True:
            if config.debug_lvl > 0:
                print("################################\n", self.child.before, "############################\n")
            # NOTE(review): under Python 2 the builtin input() evaluates what
            # is typed; this helper is presumably meant for Python 3 only.
            try:
                input_string = input("input<<<<")
            except EOFError:
                # Input stream closed (e.g. Ctrl-D): leave the shell quietly.
                break
            if config.debug_lvl > 0:
                print("Sending an input to the prog:", input_string)
            if input_string == "":
                if config.debug_lvl > 0:
                    print("input string was '\\nn' => toggle to Mode")
                self.toggleMorMode()
            self.child.sendline(input_string)
            if self.morAnalyseMode:
                if config.debug_lvl > 0:
                    print("### in analyse mode")
                prompt = self._ANALYSE_PROMPT
            else:
                if config.debug_lvl > 0:
                    print("### in generate mode")
                prompt = self._GENERATE_PROMPT
            # Index 1 is EOF: the child is gone, so there is nothing left to talk to.
            if self.child.expect([prompt, pexpect.EOF]) != 0:
                break

    def toggleMorMode(self):
        """Flip the tracked mode flag; does not talk to the child itself."""
        self.morAnalyseMode = not self.morAnalyseMode

    # this function will return the possible analysis
    # @analysis: must be a list of possible analysis
    # @filterStrings: a list of not regex strings
    def filterAnalysis(self, analysis, filterStrings, wordstem, pos):
        """Keep only the analyses that belong to ``wordstem`` and contain all filters.

        An analysis is kept when
          * the text before its first '<' equals ``wordstem`` (a leading
            '<ge>' is skipped for this comparison), and
          * every entry of ``filterStrings`` occurs literally in it, and
          * for ``pos == 'ADJ'``: the text before the first '<' equals
            ``wordstem`` (so '<ge>'-prefixed analyses are dropped) and the
            analysis contains a '<' at all.

        Args:
            analysis: list of analysis strings, or None.
            filterStrings: plain (non-regex) strings that must all be present.
            wordstem: the stem the analysis has to start with.
            pos: part-of-speech tag; only 'ADJ' triggers a special rule.

        Returns:
            A new list with the accepted analyses, in input order.
        """
        if analysis is None:
            return []

        filteredAnalysis = []
        for ana in analysis:
            stem_ok = (ana.split('<')[0] == wordstem
                       or (ana.startswith('<ge>') and ana[4:].split('<')[0] == wordstem))
            if not stem_ok:
                continue

            # Literal containment test; analyses are single output lines.
            if not all(filterString in ana for filterString in filterStrings):
                continue

            # NOTE: following are hardcoded special rules
            if pos == 'ADJ':
                stem_match = self.regex_adj_stem.match(ana)
                # No '<' at all, or a different stem in front of it => reject.
                if stem_match is None or stem_match.group(1) != wordstem:
                    continue

            filteredAnalysis.append(ana)
        return filteredAnalysis

    # find all fst symbols described in 'symbols.fst' # TODO: add link/ref to page/file
    # @analysis: ONE analysis
    def findSymbols(self, analysis):
        """Return the contents of every ``<...>`` symbol in ``analysis``, in order."""
        return self._SYMBOL_RE.findall(analysis)

    # this function returns the inflectional class of the given analysis
    # TODO: write many tests for this thing!
    # returns inflectional class or None
    # @analysis: ONE analysis
    def determineInflClass(self, analysis):
        """Return the single inflectional-class symbol of ``analysis``.

        Returns None (after printing a message) when the analysis has no
        symbols, no inflectional class symbol, or more than one of them.
        """
        # first find all symbols
        symbols = self.findSymbols(analysis)
        if not symbols:
            if config.debug_lvl > 0:
                print("no inflectional class could be found (even no symbols)")
            return None  # no symbols => no infl class

        # look for inflectional class(es) symbol NOTE: only ONE inflectional class should be found
        inflClasses = [sym for sym in symbols if self._INFL_CLASS_RE.match(sym) is not None]

        if len(inflClasses) > 1:
            print("SEVERAL INFLECTIONAL CLASSES FOUND! NOT POSSIBLE!")  # TODO: error handling
            return None  # TODO: or return all inflClasses?
        if not inflClasses:
            print("no inflectional class could be found")
            return None
        # when here, everything is ok
        return inflClasses[0]