# -*- coding: utf-8 -*-
# Автор: Гусев Илья
# Описание: Служебные функции и константы.
import re
# Lowercase Russian vowel letters.
CYRRILIC_LOWER_VOWELS = "аоэиуыеёюя"
# The remaining lowercase Russian letters (consonants plus the signs "ъ" and "ь").
CYRRILIC_LOWER_CONSONANTS = "йцкнгшщзхъфвпрлджчсмтьб"
# Latin and Russian vowels in both cases; count_vowels() and
# get_first_vowel_position() test characters for membership in this string.
VOWELS = "aeiouAEIOUаоэиуыеёюяАОЭИУЫЕЁЮЯ"
# The letters р, л, й, м, н in both cases.
# NOTE(review): presumably the consonants allowed to close a syllable during
# syllable splitting - not used in the visible code, confirm against callers.
CLOSED_SYLLABLE_CHARS = "рлймнРЛЙМН"
def text_to_wordlist(sentence, cyrillic=False):
    """Split a text into a list of lowercase words.

    Every character that is not a letter is treated as a separator.

    :param sentence: source text.
    :param cyrillic: if true, only Russian letters are kept and Latin
        letters are treated as separators as well.
    :return: list of lowercase words in order of appearance.
    """
    # "ё" and "Ё" lie outside the "а-я" / "А-Я" code point ranges, so both
    # have to be listed explicitly; otherwise a capital "Ё" would be blanked
    # out before the text is lowercased.
    if cyrillic:
        separators = "[^а-яА-ЯёЁ]"
    else:
        separators = "[^а-яА-ЯёЁa-zA-Z]"
    return re.sub(separators, " ", sentence).lower().split()
def text_to_sentences(text):
    """Split a text into sentences.

    Dots that do not end a sentence (after a single capital letter such as an
    initial, inside abbreviations made of single letters, and before a comma)
    are protected first; the text is then split on sentence-ending
    punctuation followed by a capital letter, on ";" and on a colon followed
    by a dash.

    :param text: source text.
    :return: list of sentences with surrounding whitespace stripped.
    """
    # A NUL character stands in for the protected dots. The previous "$"
    # placeholder silently rewrote every real "$" in the text into ".".
    placeholder = "\x00"
    # NOTE(review): "A-z" also covers the punctuation between "Z" and "a"
    # in ASCII; the patterns are kept as they were.
    sentence_break = r"[\.\?!](?=[\s\n]*[A-ZА-Я])|;|:-|:—|:—|: —|: —|: -"
    protected_dots = [
        r"(?<=[^A-zА-я][A-ZА-Я])\.",
        r"(?<=[^A-zА-я][A-zА-я])\.[ ]?(?=[A-zА-я][^A-zА-я])",
        r"\.(?=,)",
    ]
    for pattern in protected_dots:
        text = re.sub(pattern, placeholder, text)
    # Return a real list: a lazy map object can be consumed only once and
    # supports neither len() nor indexing.
    return [
        part.strip().replace(placeholder, ".")
        for part in re.split(sentence_break, text)
    ]
# Lowercase Latin letters that look like Russian ones, mapped to their
# Cyrillic twins, plus "ё" folded into "е". Code points are spelled out
# because the glyph pairs are visually indistinguishable in source code.
_TO_CYRILLIC_TABLE = str.maketrans({
    "x": "\u0445",  # Latin x -> Cyrillic kha
    "a": "\u0430",  # Latin a -> Cyrillic a
    "y": "\u0443",  # Latin y -> Cyrillic u
    "o": "\u043e",  # Latin o -> Cyrillic o
    "c": "\u0441",  # Latin c -> Cyrillic es
    "\u0451": "\u0435",  # Cyrillic yo -> Cyrillic ie
})


def to_cyrrilic(text):
    """Replace look-alike lowercase Latin letters with Cyrillic ones.

    Also replaces "ё" with "е". Uppercase letters are left untouched.

    :param text: source string.
    :return: string with the substitutions applied.
    """
    # One translate() pass instead of six chained replace() passes; the
    # result is the same because no replacement output is itself a key.
    return text.translate(_TO_CYRILLIC_TABLE)
def normilize_line(text):
    """Normalize a line to a compact lowercase form for comparison.

    Everything except letters and digits is dropped, the rest is lowercased,
    joined without spaces and passed through to_cyrrilic().

    :param text: source line.
    :return: normalized string.
    """
    # "Ё" is outside the "А-Я" range and must be listed explicitly; without
    # it a capital "Ё" was removed while a lowercase "ё" was kept.
    cleaned = re.sub("[^а-яА-ЯёЁa-zA-Z0-9]", " ", text)
    return to_cyrrilic("".join(cleaned.lower().split()))
def count_vowels(string):
    """Count the characters of a string that are vowels.

    :param string: string to inspect.
    :return: number of characters found in VOWELS.
    """
    return sum(1 for symbol in string if symbol in VOWELS)
def get_first_vowel_position(string):
    """Find the index of the first vowel in a string.

    :param string: string to inspect.
    :return: zero-based index of the first character found in VOWELS,
        or -1 when the string contains no vowel.
    """
    vowel_indexes = (
        index for index, symbol in enumerate(string) if symbol in VOWELS
    )
    return next(vowel_indexes, -1)
def etree_to_dict(t):
    """Convert an element tree node into nested dictionaries.

    :param t: node exposing ``tag``, ``text`` and ``iterchildren()``.
    :return: single-key dict mapping the node's tag to the list of its
        converted children, or to the node's text when it has no children.
    """
    # The children must be materialised: a map object is always truthy in
    # Python 3, so the fallback to the node's text would never be taken.
    children = [etree_to_dict(child) for child in t.iterchildren()]
    return {t.tag: children or t.text}