Spaces:
Runtime error
Runtime error
| # IMPORTANT NOTE: DO NOT DIRECTLY EDIT THIS FILE | |
| # This file was manually ported from `normalize-punctuation.perl` | |
| # TODO: Only supports English, add others | |
| import regex as re | |
# Patterns applied (in this order) after the literal replacements below.
multispace_regex = re.compile("[ ]{2,}")
multidots_regex = re.compile(r"\.{2,}")
end_bracket_space_punc_regex = re.compile(r"\) ([\.!:?;,])")
digit_space_percent = re.compile(r"(\d) %")
double_quot_punc = re.compile(r"\"([,\.]+)")
# Two digits separated by a NO-BREAK SPACE (U+00A0). The separator is written
# as an explicit escape: a literal NBSP is indistinguishable from an ordinary
# space in most editors and is easily lost, which would turn "3 4" into "3.4".
digit_nbsp_digit = re.compile("(\\d)\u00a0(\\d)")

# Literal (old, new) substitutions, applied top to bottom. Order matters:
# e.g. "`" must become "'" before "''" is collapsed into a double quote, and
# the spaced variants of the French quotes must run before the bare ones.
_LITERAL_REPLACEMENTS = (
    ("\r", ""),
    # Put a space outside brackets, then remove the space inside them.
    ("(", " ("),
    (")", ") "),
    ("( ", "("),
    (" )", ")"),
    (" :", ":"),
    (" ;", ";"),
    ("`", "'"),
    # Normalize Unicode punctuation to ASCII.
    ("\u201e", '"'),    # „
    ("\u201c", '"'),    # “
    ("\u201d", '"'),    # ”
    ("\u2013", "-"),    # –
    ("\u2014", " - "),  # —
    ("\u00b4", "'"),    # ´
    ("\u2018", "'"),    # ‘
    ("\u201a", "'"),    # ‚
    ("\u2019", "'"),    # ’
    ("''", '"'),
    ("\u00b4\u00b4", '"'),
    ("\u2026", "..."),  # …
    # French quotes.
    (" \u00ab ", ' "'),
    ("\u00ab ", '"'),
    ("\u00ab", '"'),
    (" \u00bb ", '" '),
    (" \u00bb", '"'),
    ("\u00bb", '"'),
    # Pseudo-spaces: every "\u00a0" below is a NO-BREAK SPACE, spelled out so
    # these rules cannot silently degrade into no-ops or duplicates.
    ("\u00a0%", "%"),
    ("n\u00ba\u00a0", "n\u00ba "),
    ("\u00a0:", ":"),
    ("\u00a0\u00baC", " \u00baC"),
    ("\u00a0cm", " cm"),
    ("\u00a0?", "?"),
    ("\u00a0!", "!"),
    ("\u00a0;", ";"),
    (",\u00a0", ", "),
)


def punc_norm(text: str, lang: str = "en") -> str:
    """Normalize punctuation and spacing in ``text``.

    Applies the ordered literal substitutions in ``_LITERAL_REPLACEMENTS``
    and then the module-level regex rewrites: collapse runs of spaces,
    collapse runs of dots to a single dot, drop the space between ")" and
    following punctuation, attach "%" to a preceding digit, move "," / "."
    from after a double quote to before it, and replace a no-break space
    between two digits with ".".

    Args:
        text: The string to normalize.
        lang: Language code. Currently ignored -- the same (English) rules
            are applied whatever value is passed.

    Returns:
        The normalized string with leading and trailing ASCII spaces removed
        (other whitespace characters are not stripped).
    """
    for old, new in _LITERAL_REPLACEMENTS:
        text = text.replace(old, new)

    text = multispace_regex.sub(" ", text)
    text = multidots_regex.sub(".", text)
    text = end_bracket_space_punc_regex.sub(r")\1", text)
    text = digit_space_percent.sub(r"\1%", text)
    # English "quotation," followed by comma, style
    text = double_quot_punc.sub(r'\1"', text)
    # A no-break space between digits is treated as a numeric separator and
    # becomes "."; an ordinary space between digits is left alone.
    text = digit_nbsp_digit.sub(r"\1.\2", text)
    return text.strip(" ")