Spaces:
Sleeping
Sleeping
Ludovic Moncla
committed on
Commit
·
91d8ac2
1
Parent(s):
9a4bad6
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -176,6 +176,7 @@ def dms_to_dd(dms):
|
|
| 176 |
def is_partitive_expression(text, spans, i, j):
|
| 177 |
if spans[i]['entity_group'] == 'NC_Spatial' and spans[j]['entity_group'] == 'NP_Spatial':
|
| 178 |
boundaries = [spans[i]['end'], spans[j]['start']]
|
|
|
|
| 179 |
if " de " in text[boundaries[0]:boundaries[1]] or " d'" in text[boundaries[0]:boundaries[1]] or " du " in text[boundaries[0]:boundaries[1]]:
|
| 180 |
return True
|
| 181 |
return False
|
|
@@ -186,9 +187,13 @@ def pattern_starting_article(text, spans):
|
|
| 186 |
if spans[1]['entity_group'] == 'Domain-mark':
|
| 187 |
if is_partitive_expression(text, spans, 2, 3):
|
| 188 |
return spans[3]
|
|
|
|
|
|
|
| 189 |
else:
|
| 190 |
if is_partitive_expression(text, spans, 1, 2):
|
| 191 |
return spans[2]
|
|
|
|
|
|
|
| 192 |
return None
|
| 193 |
|
| 194 |
|
|
@@ -238,46 +243,51 @@ def segmentation_head(head):
|
|
| 238 |
"la cité de la","pays des quatre","fontaine de","pays de","port de","état de","Motte de","les quatre","pays de","golphe","duché d'","détroit d'","seigneurie d'","isles","église de","salle de","l'ile","principauté de","île d'","cap de","isle de l'","Isle de l'"]
|
| 239 |
list_saint = ["saint","sainte","Saint","Sainte","San","san","sant","Sant","S.","St.","St","Saint-","saint-","sainte-","Sainte-"]
|
| 240 |
|
| 241 |
-
# XXX,XXX
|
| 242 |
match = re.fullmatch(r"([{}]+),([{}]+)$".format(pattern, pattern), head, re.IGNORECASE)
|
| 243 |
if match:
|
|
|
|
| 244 |
label1, label2 = match.groups()
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
| 248 |
else:
|
| 249 |
-
|
| 250 |
-
return [label1.strip(), label2.strip()+" "+label1.strip()]
|
| 251 |
-
else:
|
| 252 |
-
return [label2.strip()+" "+label1.strip()]
|
| 253 |
|
| 254 |
# XXX (xxx)
|
| 255 |
match = re.fullmatch(r"([{}]+)\s\(([{}]+)\)$".format(pattern, pattern), head, re.IGNORECASE)
|
| 256 |
if match:
|
| 257 |
nom, prefixe = match.groups()
|
| 258 |
-
|
|
|
|
| 259 |
if prefixe not in list_saint:
|
| 260 |
-
return [nom
|
| 261 |
else:
|
| 262 |
-
return [prefixe
|
| 263 |
|
| 264 |
|
| 265 |
#XXX,(xxx)
|
| 266 |
match = re.fullmatch(r"([{}]+),\(([{}]+)\)$".format(pattern, pattern), head, re.IGNORECASE)
|
| 267 |
if match:
|
| 268 |
nom, prefixe = match.groups()
|
|
|
|
|
|
|
| 269 |
if prefixe not in list_saint:
|
| 270 |
-
return [nom
|
| 271 |
else:
|
| 272 |
-
return [prefixe
|
| 273 |
|
| 274 |
# XXX le/les/...
|
| 275 |
match = re.fullmatch(r"([{}]+)\s(L'|la nouvelle|Vallée d'|isles de l'|isles des|l'ile de|la|colonnes d'|Monts|le|les|Les|les îles d'|l'Isle de|Baie d'|lac des|l'|lac d'|Sant|isle|Isle|mare|val|la terre|Santa|Colonia|île de la|Golfe|mons|terre des|palus|San|Mont|royaume de|sinus|alpes|Montes|flumen|Nemaviae|lucus|portus|aquae|pays de|la vallée de|isles de Scopelo|le cap de|vicus|cap de|Civitas|porte|insula|terre de|le chatel|san|lac|promontorium|oppidum|Iles|état de|ville de|la rade de|templum|fanum|le grand|le petit|Préfecture de|le comté de|l'île de|bailliage d'|comté de|regio)$", head,
|
| 276 |
re.IGNORECASE)
|
| 277 |
if match:
|
| 278 |
nom, prefixe = match.groups()
|
|
|
|
|
|
|
| 279 |
if prefixe not in list_saint:
|
| 280 |
-
return [nom
|
| 281 |
-
|
| 282 |
else:
|
| 283 |
-
return [prefixe
|
|
|
|
| 176 |
def is_partitive_expression(text, spans, i, j):
    """Tell whether spans[i] and spans[j] are linked by a partitive marker.

    The check succeeds when spans[i] is tagged 'NC_Spatial', spans[j] is
    tagged 'NP_Spatial', and the slice of text between the end of spans[i]
    and the start of spans[j] contains " de ", " d'" or " du ".
    (Presumably a spatial common noun followed by a spatial proper noun,
    e.g. "ville de Paris" -- confirm against the tagger's label set.)

    Args:
        text: String that the span offsets index into.
        spans: Sequence of dicts providing 'entity_group', 'start' and 'end'.
        i: Position in spans of the candidate 'NC_Spatial' span.
        j: Position in spans of the candidate 'NP_Spatial' span.

    Returns:
        True if the pattern is found; False otherwise, including when
        either position does not exist in spans.
    """
    # Callers probe fixed positions (such as 3 and 4) without checking how
    # many spans exist, so a missing span means "no match" rather than an
    # IndexError.
    try:
        head_span, tail_span = spans[i], spans[j]
    except IndexError:
        return False

    if head_span['entity_group'] != 'NC_Spatial':
        return False
    if tail_span['entity_group'] != 'NP_Spatial':
        return False

    # Only the text strictly between the two entities is inspected.
    between = text[head_span['end']:tail_span['start']]
    return any(marker in between for marker in (" de ", " d'", " du "))
|
|
|
|
| 187 |
if spans[1]['entity_group'] == 'Domain-mark':
|
| 188 |
if is_partitive_expression(text, spans, 2, 3):
|
| 189 |
return spans[3]
|
| 190 |
+
elif is_partitive_expression(text, spans, 3, 4):
|
| 191 |
+
return spans[4]
|
| 192 |
else:
|
| 193 |
if is_partitive_expression(text, spans, 1, 2):
|
| 194 |
return spans[2]
|
| 195 |
+
elif is_partitive_expression(text, spans, 2, 3):
|
| 196 |
+
return spans[3]
|
| 197 |
return None
|
| 198 |
|
| 199 |
|
|
|
|
| 243 |
"la cité de la","pays des quatre","fontaine de","pays de","port de","état de","Motte de","les quatre","pays de","golphe","duché d'","détroit d'","seigneurie d'","isles","église de","salle de","l'ile","principauté de","île d'","cap de","isle de l'","Isle de l'"]
|
| 244 |
list_saint = ["saint","sainte","Saint","Sainte","San","san","sant","Sant","S.","St.","St","Saint-","saint-","sainte-","Sainte-"]
|
| 245 |
|
| 246 |
+
# XXX, XXX
|
| 247 |
match = re.fullmatch(r"([{}]+),([{}]+)$".format(pattern, pattern), head, re.IGNORECASE)
|
| 248 |
if match:
|
| 249 |
+
|
| 250 |
label1, label2 = match.groups()
|
| 251 |
+
# remove leading and trailing spaces
|
| 252 |
+
label1 = label1.strip()
|
| 253 |
+
label2 = label2.strip()
|
| 254 |
+
print("***************label1,label2:[",label1,"][", label2,"]")
|
| 255 |
+
if label2 not in list_prefixe and label2 not in list_saint:
|
| 256 |
+
return [label1, label2]
|
| 257 |
else:
|
| 258 |
+
return [label2+" "+label1]
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
# XXX (xxx)
|
| 261 |
match = re.fullmatch(r"([{}]+)\s\(([{}]+)\)$".format(pattern, pattern), head, re.IGNORECASE)
|
| 262 |
if match:
|
| 263 |
nom, prefixe = match.groups()
|
| 264 |
+
nom = nom.strip()
|
| 265 |
+
prefixe = prefixe.strip()
|
| 266 |
if prefixe not in list_saint:
|
| 267 |
+
return [nom, prefixe+" "+nom]
|
| 268 |
else:
|
| 269 |
+
return [prefixe+" "+nom]
|
| 270 |
|
| 271 |
|
| 272 |
#XXX,(xxx)
|
| 273 |
match = re.fullmatch(r"([{}]+),\(([{}]+)\)$".format(pattern, pattern), head, re.IGNORECASE)
|
| 274 |
if match:
|
| 275 |
nom, prefixe = match.groups()
|
| 276 |
+
nom = nom.strip()
|
| 277 |
+
prefixe = prefixe.strip()
|
| 278 |
if prefixe not in list_saint:
|
| 279 |
+
return [nom, prefixe+" "+nom]
|
| 280 |
else:
|
| 281 |
+
return [prefixe+" "+nom]
|
| 282 |
|
| 283 |
# XXX le/les/...
|
| 284 |
match = re.fullmatch(r"([{}]+)\s(L'|la nouvelle|Vallée d'|isles de l'|isles des|l'ile de|la|colonnes d'|Monts|le|les|Les|les îles d'|l'Isle de|Baie d'|lac des|l'|lac d'|Sant|isle|Isle|mare|val|la terre|Santa|Colonia|île de la|Golfe|mons|terre des|palus|San|Mont|royaume de|sinus|alpes|Montes|flumen|Nemaviae|lucus|portus|aquae|pays de|la vallée de|isles de Scopelo|le cap de|vicus|cap de|Civitas|porte|insula|terre de|le chatel|san|lac|promontorium|oppidum|Iles|état de|ville de|la rade de|templum|fanum|le grand|le petit|Préfecture de|le comté de|l'île de|bailliage d'|comté de|regio)$", head,
|
| 285 |
re.IGNORECASE)
|
| 286 |
if match:
|
| 287 |
nom, prefixe = match.groups()
|
| 288 |
+
nom = nom.strip()
|
| 289 |
+
prefixe = prefixe.strip()
|
| 290 |
if prefixe not in list_saint:
|
| 291 |
+
return [nom, prefixe + " " + nom]
|
|
|
|
| 292 |
else:
|
| 293 |
+
return [prefixe + " " + nom]
|