Spaces:

vitaly
/

bibliography-parser

Build error

App Files Files Community

vitaly commited on Jul 17, 2022

Commit

b303a60

1 Parent(s): 0f8d97e

Upload bib_tokenizers.py

Browse files

Files changed (1) hide show

bib_tokenizers.py +56 -0

bib_tokenizers.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import spacy
+from spacy.lang.char_classes import LIST_ELLIPSES
+from spacy.lang.char_classes import HYPHENS, LIST_ICONS
+from spacy.lang.char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    PUNCT,
+)
+from spacy.util import compile_infix_regex, registry
+infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        # [10]R. Nakamura and T. Kenzaka
+        # (10)R. Nakamura and T. Kenzaka
+        # 10.R. Nakamura and T. Kenzaka
+        r"(?<=[{a}0-9])[:<>\]\)\.](?=[{a}])".format(a=ALPHA),
+        # 10R. Nakamura and T. Kenzaka
+        r"(?<=[0-9])(?=[{a}])".format(a=ALPHA),
+    ]
+)
+@spacy.registry.tokenizers("references_tokenizer")
+def create_references_tokenizer():
+    def create_tokenizer(nlp):
+        default_tokenizer_cfg = {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}
+        create_default_tokenizer = registry.resolve(default_tokenizer_cfg)["tokenizer"]
+        tokenizer = create_default_tokenizer(nlp)
+        infix_re = compile_infix_regex(infixes)
+        tokenizer.infix_finditer = infix_re.finditer
+        for s in [
+            "[10]R. Nakamura and T. Kenzaka",
+            "(10)R. Nakamura and T. Kenzaka",
+            "10.R. Nakamura and T. Kenzaka",
+            "10R. Nakamura and T. Kenzaka",
+        ]:
+            print(s, "->", [t for t in tokenizer(s)])
+        return tokenizer
+    return create_tokenizer