Spaces:

GEODE
/

encyclopedia-to-geokg

Sleeping

App Files Files Community

Ludovic Moncla committed on Oct 21

Commit

91d8ac2

1 Parent(s): 9a4bad6

Update utils.py

Browse files

Files changed (1) hide show

utils.py +26 -16

utils.py CHANGED Viewed

@@ -176,6 +176,7 @@ def dms_to_dd(dms):
 def is_partitive_expression(text, spans, i, j):
     if spans[i]['entity_group'] == 'NC_Spatial' and spans[j]['entity_group'] == 'NP_Spatial':
         boundaries = [spans[i]['end'], spans[j]['start']]
         if " de " in text[boundaries[0]:boundaries[1]] or " d'" in text[boundaries[0]:boundaries[1]] or " du " in text[boundaries[0]:boundaries[1]]:
             return True
     return False
@@ -186,9 +187,13 @@ def pattern_starting_article(text, spans):
             if spans[1]['entity_group'] == 'Domain-mark':
                 if is_partitive_expression(text, spans, 2, 3):
                     return spans[3]
             else:
                 if is_partitive_expression(text, spans, 1, 2):
                     return spans[2]
     return None
@@ -238,46 +243,51 @@ def segmentation_head(head):
                         "la cité de la","pays des quatre","fontaine de","pays de","port de","état de","Motte de","les quatre","pays de","golphe","duché d'","détroit d'","seigneurie d'","isles","église de","salle de","l'ile","principauté de","île d'","cap de","isle de l'","Isle de l'"]
     list_saint = ["saint","sainte","Saint","Sainte","San","san","sant","Sant","S.","St.","St","Saint-","saint-","sainte-","Sainte-"]
-    # XXX,XXX
     match = re.fullmatch(r"([{}]+),([{}]+)$".format(pattern, pattern), head, re.IGNORECASE)
     if match:
         label1, label2 = match.groups()
-        if label2 not in list_prefixe:
-            return [label1.strip(), label2.strip()]
         else:
-            if label2 not in list_saint:
-                return [label1.strip(), label2.strip()+" "+label1.strip()]
-            else:
-                return [label2.strip()+" "+label1.strip()]
     # XXX (xxx)
     match = re.fullmatch(r"([{}]+)\s\(([{}]+)\)$".format(pattern, pattern), head, re.IGNORECASE)
     if match:
         nom, prefixe = match.groups()
         if prefixe not in list_saint:
-            return [nom.strip(), prefixe.strip()+" "+nom.strip()]
         else:
-            return [prefixe.strip()+" "+nom.strip()]
     #XXX,(xxx)
     match = re.fullmatch(r"([{}]+),\(([{}]+)\)$".format(pattern, pattern), head, re.IGNORECASE)
     if match:
         nom, prefixe = match.groups()
         if prefixe not in list_saint:
-            return [nom.strip(), prefixe.strip() + " " + nom.strip()]
         else:
-            return [prefixe.strip() + " " + nom.strip()]
     # XXX le/les/...
     match = re.fullmatch(r"([{}]+)\s(L'|la nouvelle|Vallée d'|isles de l'|isles des|l'ile de|la|colonnes d'|Monts|le|les|Les|les îles d'|l'Isle de|Baie d'|lac des|l'|lac d'|Sant|isle|Isle|mare|val|la terre|Santa|Colonia|île de la|Golfe|mons|terre des|palus|San|Mont|royaume de|sinus|alpes|Montes|flumen|Nemaviae|lucus|portus|aquae|pays de|la vallée de|isles de Scopelo|le cap de|vicus|cap de|Civitas|porte|insula|terre de|le chatel|san|lac|promontorium|oppidum|Iles|état de|ville de|la rade de|templum|fanum|le grand|le petit|Préfecture de|le comté de|l'île de|bailliage d'|comté de|regio)$", head,
                           re.IGNORECASE)
     if match:
         nom, prefixe = match.groups()
         if prefixe not in list_saint:
-            return [nom.strip(), prefixe.strip() + " " + nom.strip()]
         else:
-            return [prefixe.strip() + " " + nom.strip()]

 def is_partitive_expression(text, spans, i, j):
     if spans[i]['entity_group'] == 'NC_Spatial' and spans[j]['entity_group'] == 'NP_Spatial':
         boundaries = [spans[i]['end'], spans[j]['start']]
+        print("####partitive boundaries",boundaries)
         if " de " in text[boundaries[0]:boundaries[1]] or " d'" in text[boundaries[0]:boundaries[1]] or " du " in text[boundaries[0]:boundaries[1]]:
             return True
     return False
             if spans[1]['entity_group'] == 'Domain-mark':
                 if is_partitive_expression(text, spans, 2, 3):
                     return spans[3]
+                elif is_partitive_expression(text, spans, 3, 4):
+                    return spans[4]
             else:
                 if is_partitive_expression(text, spans, 1, 2):
                     return spans[2]
+                elif is_partitive_expression(text, spans, 2, 3):
+                    return spans[3]
     return None
                         "la cité de la","pays des quatre","fontaine de","pays de","port de","état de","Motte de","les quatre","pays de","golphe","duché d'","détroit d'","seigneurie d'","isles","église de","salle de","l'ile","principauté de","île d'","cap de","isle de l'","Isle de l'"]
     list_saint = ["saint","sainte","Saint","Sainte","San","san","sant","Sant","S.","St.","St","Saint-","saint-","sainte-","Sainte-"]
+    # XXX, XXX
     match = re.fullmatch(r"([{}]+),([{}]+)$".format(pattern, pattern), head, re.IGNORECASE)
     if match:
         label1, label2 = match.groups()
+        # remove leading and trailing spaces
+        label1 = label1.strip()
+        label2 = label2.strip()
+        print("***************label1,label2:[",label1,"][", label2,"]")
+        if label2 not in list_prefixe and label2 not in list_saint:
+            return [label1, label2]
         else:
+            return [label2+" "+label1]
     # XXX (xxx)
     match = re.fullmatch(r"([{}]+)\s\(([{}]+)\)$".format(pattern, pattern), head, re.IGNORECASE)
     if match:
         nom, prefixe = match.groups()
+        nom = nom.strip()
+        prefixe = prefixe.strip()
         if prefixe not in list_saint:
+            return [nom, prefixe+" "+nom]
         else:
+            return [prefixe+" "+nom]
     #XXX,(xxx)
     match = re.fullmatch(r"([{}]+),\(([{}]+)\)$".format(pattern, pattern), head, re.IGNORECASE)
     if match:
         nom, prefixe = match.groups()
+        nom = nom.strip()
+        prefixe = prefixe.strip()
         if prefixe not in list_saint:
+            return [nom, prefixe+" "+nom]
         else:
+            return [prefixe+" "+nom]
     # XXX le/les/...
     match = re.fullmatch(r"([{}]+)\s(L'|la nouvelle|Vallée d'|isles de l'|isles des|l'ile de|la|colonnes d'|Monts|le|les|Les|les îles d'|l'Isle de|Baie d'|lac des|l'|lac d'|Sant|isle|Isle|mare|val|la terre|Santa|Colonia|île de la|Golfe|mons|terre des|palus|San|Mont|royaume de|sinus|alpes|Montes|flumen|Nemaviae|lucus|portus|aquae|pays de|la vallée de|isles de Scopelo|le cap de|vicus|cap de|Civitas|porte|insula|terre de|le chatel|san|lac|promontorium|oppidum|Iles|état de|ville de|la rade de|templum|fanum|le grand|le petit|Préfecture de|le comté de|l'île de|bailliage d'|comté de|regio)$", head,
                           re.IGNORECASE)
     if match:
         nom, prefixe = match.groups()
+        nom = nom.strip()
+        prefixe = prefixe.strip()
         if prefixe not in list_saint:
+            return [nom, prefixe + " " + nom]
         else:
+            return [prefixe + " " + nom]