Source code for rupo.util.preprocess

# -*- coding: utf-8 -*-
# Автор: Гусев Илья
# Описание: Служебные функции и константы.

import re

# Lowercase Russian vowel letters.
CYRRILIC_LOWER_VOWELS = "аоэиуыеёюя"
# Lowercase Russian consonant letters, together with the hard and soft signs.
CYRRILIC_LOWER_CONSONANTS = "йцкнгшщзхъфвпрлджчсмтьб"
# Latin and Russian vowels in both cases; the upper-case Russian half is
# derived from the lower-case constant instead of being spelled out again.
VOWELS = "aeiou" + "AEIOU" + CYRRILIC_LOWER_VOWELS + CYRRILIC_LOWER_VOWELS.upper()
# The letters р, л, й, м, н in both cases.
# NOTE(review): presumably the letters that may close a syllable - confirm
# against the code that uses this constant.
CLOSED_SYLLABLE_CHARS = "рлймн" + "рлймн".upper()


def text_to_wordlist(sentence, cyrillic=False):
    """Split text into a list of lowercase words.

    Every character outside the accepted alphabet is treated as a word
    separator, so digits and punctuation never appear in the result.

    :param sentence: text to split.
    :param cyrillic: if True, only Russian letters are kept; otherwise
        Latin letters are kept as well.
    :return: list of lowercase words in order of appearance.
    """
    # "Ё" (U+0401) lies outside the "А-Я" range, so it must be listed
    # explicitly; otherwise a capital "Ё" is replaced by a space before
    # lowercasing and the word loses its first letter.
    if cyrillic:
        regexp = "[^а-яА-ЯёЁ]"
    else:
        regexp = "[^а-яА-ЯёЁa-zA-Z]"
    return re.sub(regexp, " ", sentence).lower().split()


def text_to_sentences(text):
    """Split text into sentences.

    A sentence ends at ".", "?" or "!" when it is followed by optional
    whitespace and an uppercase letter, and also at ";" and at the
    colon-dash combinations listed in the boundary pattern. Dots matched by
    the "protected" patterns are temporarily hidden so that they do not
    split a sentence, and are restored afterwards.

    :param text: text to split.
    :return: list of sentences with surrounding whitespace stripped; the
        separators themselves are not included.
    """
    boundary = r"[\.\?!](?=[\s\n]*[A-ZА-Я])|;|:-|:—|:—|: —|: —|: -"
    # NOTE(review): the "A-z" range also covers the ASCII characters between
    # "Z" and "a" ("[", "\", "]", "^", "_", "`"); kept as in the original.
    protected_dots = [
        # Dot after a single uppercase letter preceded by a non-letter.
        r"(?<=[^A-zА-я][A-ZА-Я])\.",
        # Dot (plus an optional space) between two single letters.
        r"(?<=[^A-zА-я][A-zА-я])\.[ ]?(?=[A-zА-я][^A-zА-я])",
        # Dot immediately followed by a comma.
        r"\.(?=,)",
    ]
    # Use a private-use character that is absent from the text as the
    # stand-in for protected dots. A fixed "$" placeholder would turn every
    # genuine "$" of the input into "." when the dots are restored.
    placeholder = next(
        chr(code) for code in range(0xE000, 0xF900) if chr(code) not in text
    )
    for pattern in protected_dots:
        text = re.sub(pattern, placeholder, text)

    # Return a real list rather than a lazy, single-use ``map`` object.
    return [
        part.strip().replace(placeholder, ".")
        for part in re.split(boundary, text)
    ]


[docs]def to_cyrrilic(text):
    return text.replace("x", "х") \
        .replace("a", "а") \
        .replace("y", "у") \
        .replace("o", "о") \
        .replace("c", "с") \
        .replace("ё", "е")


def normilize_line(text):
    """Reduce a line to a compact lowercase form.

    Everything except Russian letters, Latin letters and digits is removed
    (whitespace included), the rest is lowercased and passed through
    ``to_cyrrilic``.

    :param text: line to normalize.
    :return: normalized string without any separators.
    """
    # "Ё" (U+0401) is not part of the "А-Я" range and has to be listed
    # explicitly, otherwise a capital "Ё" is silently dropped.
    regexp = "[^а-яА-ЯёЁa-zA-Z0-9]"
    # Deleting the rejected characters directly is equivalent to replacing
    # them with spaces and then removing all whitespace.
    kept = re.sub(regexp, "", text)
    return to_cyrrilic(kept.lower())


def count_vowels(string):
    """Count the characters of ``string`` that occur in ``VOWELS``.

    :param string: text to inspect.
    :return: number of vowel characters.
    """
    return sum(1 for symbol in string if symbol in VOWELS)


def get_first_vowel_position(string):
    """Find the index of the first character of ``string`` found in ``VOWELS``.

    :param string: text to inspect.
    :return: zero-based index of the first vowel, or -1 if there is none.
    """
    vowel_indices = (
        index for index, symbol in enumerate(string) if symbol in VOWELS
    )
    return next(vowel_indices, -1)


def etree_to_dict(t):
    """Recursively convert an element tree node into nested dictionaries.

    :param t: node exposing ``tag``, ``text`` and ``iterchildren()``
        (presumably an lxml element - confirm against callers).
    :return: single-key dict mapping the node's tag to the list of its
        converted children, or to the node's text when it has no children.
    """
    # The children must be materialized: in Python 3 a ``map`` object is
    # always truthy, so ``map(...) or t.text`` never falls back to the text
    # and leaf nodes would carry an exhausted iterator instead.
    children = [etree_to_dict(child) for child in t.iterchildren()]
    return {t.tag: children or t.text}