# Ludovic Moncla
# Update app.py
# db0157d
import gradio as gr
import networkx as nx
import matplotlib.pyplot as plt
from rdflib import Namespace, Graph, Literal
from rdflib.namespace import RDF, RDFS, XSD
from pyvis.network import Network
from transformers import pipeline
from utils import *
import plotly.graph_objects as go
import tempfile
def _text_classifier(model_id):
    """Load a Hugging Face text-classification pipeline with truncation enabled."""
    return pipeline("text-classification", model=model_id, truncation=True)


# Article-level classifiers; run_pipeline applies them in cascade
# (domain -> entry type -> place type / cardinality).
domain_type_classifier = _text_classifier("GEODE/bert-base-multilingual-cased-edda-domain-classification")
entry_type_classifier = _text_classifier("GEODE/bert-base-multilingual-cased-geography-entry-classification")
place_type_classifier = _text_classifier("GEODE/bert-base-multilingual-cased-place-entry-classification")
cardinality_classifier = _text_classifier("GEODE/bert-base-multilingual-cased-single-multiple-place-classification")

# Span extraction; "simple" aggregation yields one dict per entity group.
ner = pipeline(
    "token-classification",
    model="GEODE/camembert-base-edda-span-classification",
    aggregation_strategy="simple",
)

# Span-level classifiers, fed with the context window built around each span.
placename_classifier = _text_classifier("GEODE/bert-base-multilingual-cased-classification-ner")
relation_classifier = _text_classifier("GEODE/bert-base-multilingual-cased-classification-relation")

# Text-to-text model whose output run_pipeline hands to dms_to_dd.
generator = pipeline(
    "text2text-generation",
    model="GEODE/mt5-small-coords-norm",
    truncation=True,
)
def create_map(lat, long):
    """Return a Plotly figure showing one marker on an OpenStreetMap base map.

    Args:
        lat: Latitude of the point; also used as the map centre.
        long: Longitude of the point; also used as the map centre.

    Returns:
        A ``go.Figure`` holding a single ``Scattermapbox`` trace, centred on
        the point at zoom level 6 and drawn without margins.
    """
    point_marker = go.scattermapbox.Marker(size=20)
    point_trace = go.Scattermapbox(
        lat=[lat],
        lon=[long],
        mode='markers',
        marker=point_marker,
    )
    map_centre = go.layout.mapbox.Center(lat=lat, lon=long)

    figure = go.Figure(point_trace)
    figure.update_layout(
        mapbox_style="open-street-map",
        mapbox={"bearing": 0, "center": map_centre, "pitch": 0, "zoom": 6},
        margin={"l": 0, "r": 0, "t": 0, "b": 0},
    )
    return figure
def build_visualization(graph, height="1000px", width="100%"):
    """Build a pyvis network that mirrors the triples of an rdflib graph.

    Args:
        graph: Iterable of ``(subject, predicate, object)`` triples.
        height: Height given to the pyvis ``Network``.
        width: Width given to the pyvis ``Network``.

    Returns:
        The populated ``Network``. Subjects and non-literal objects are boxes
        labelled with the last ``/``-separated segment of their string form;
        literal objects are orange ellipses labelled with their full text.
        Each edge is labelled with the last segment of its predicate.
    """
    def last_segment(term):
        # Short display name: whatever follows the final "/" of the term.
        return str(term).split("/")[-1]

    network = Network(
        height=height,
        width=width,
        directed=True,
        neighborhood_highlight=True,
        notebook=True,
        cdn_resources="in_line",
    )
    network.force_atlas_2based()
    network.toggle_physics(True)

    for subject, predicate, obj in graph:
        subject_id = str(subject)
        object_id = str(obj)

        network.add_node(subject_id, label=last_segment(subject), title=subject_id, shape="box")
        if isinstance(obj, Literal):
            network.add_node(object_id, label=object_id, title=object_id, shape="ellipse", color="orange")
        else:
            network.add_node(object_id, label=last_segment(obj), title=object_id, shape="box")
        network.add_edge(subject_id, object_id, label=last_segment(predicate))

    return network
def create_graph_viz(g):
    """Render a graph of triples as an interactive HTML page and a static PNG.

    Args:
        g: Iterable of ``(subject, predicate, object)`` triples (an rdflib
            graph in this application).

    Returns:
        Tuple ``(img_path, html_path)`` of file paths. Both files are created
        as uniquely named temporary files with ``delete=False``; this function
        does not remove them.
    """
    # Reserve a unique name and close the handle before pyvis writes to the
    # path, so the file is never opened twice at the same time.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as html_tmp:
        html_path = html_tmp.name
    build_visualization(g).write_html(html_path)

    # Create NetworkX graph for static image
    G = nx.DiGraph()
    for subj, pred, obj in g:
        G.add_node(str(subj), label=str(subj).split("/")[-1])
        if isinstance(obj, Literal):
            G.add_node(str(obj), label=str(obj))
        else:
            G.add_node(str(obj), label=str(obj).split("/")[-1])
        G.add_edge(str(subj), str(obj), label=str(pred).split("/")[-1])

    # One PNG per call: a fixed file name would let concurrent requests
    # overwrite each other's image.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as img_tmp:
        img_path = img_tmp.name

    # Save static image
    fig = plt.figure(figsize=(10, 8))
    try:
        pos = nx.spring_layout(G)
        nx.draw(
            G,
            pos,
            # Draw the short names stored on the nodes rather than the node
            # ids, which are full URI strings.
            labels=nx.get_node_attributes(G, "label"),
            with_labels=True,
            node_size=2000,
            node_color="skyblue",
            font_size=8,
            arrows=True,
        )
        edge_labels = nx.get_edge_attributes(G, "label")
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)
        fig.savefig(img_path, dpi=100, bbox_inches="tight")
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    return img_path, html_path
# Rebuild the whole text as a list of contiguous spans
def build_complete_spans(text, spans):
    """Split ``text`` into consecutive ``(substring, label)`` pieces.

    Args:
        text: The full input string.
        spans: Iterable of dicts with integer ``"start"`` / ``"end"`` offsets
            into ``text`` and an ``"entity_group"`` label. Order is irrelevant.

    Returns:
        A list of ``(substring, label)`` tuples covering ``text`` from start
        to end. Text outside every span gets the label ``None``. Spans are
        clipped against the text already emitted, so overlapping or nested
        spans never duplicate characters; spans left empty by that clipping
        are skipped.
    """
    pieces = []
    cursor = 0  # offset of the first character not yet emitted
    text_length = len(text)

    for span in sorted(spans, key=lambda s: s["start"]):
        # Clip the span to what has not been emitted yet and to the text end.
        start = max(span["start"], cursor)
        end = min(span["end"], text_length)
        if end <= start:
            continue  # empty, or fully covered by an earlier span

        # Unlabelled text preceding the entity
        if start > cursor:
            pieces.append((text[cursor:start], None))
        # The entity itself
        pieces.append((text[start:end], span["entity_group"]))
        cursor = end

    # Remaining text after the last entity
    if cursor < text_length:
        pieces.append((text[cursor:], None))
    return pieces
def _clean_head(word):
    """Return the headword text, cut at the first '(' when there is one."""
    if '(' in word:
        return word.split('(')[0].strip()
    return word


def run_pipeline(text):
    """Classify an encyclopedia entry and, for single-place entries, build its RDF graph.

    The entry is tagged with ``ner``, then classified in cascade: domain,
    entry type, place type and cardinality. A graph is only built when the
    domain is 'Géographie', the entry type is 'Place' and the cardinality is
    'Single'.

    Args:
        text: Raw text of the entry.

    Returns:
        An 8-tuple matching the Gradio outputs wired to this function:
        ``(img_path, html_path, summary_markdown, highlighted_spans,
        places_markdown, relations_markdown, rdf_path, map_figure)``.
        ``img_path``, ``html_path`` and ``rdf_path`` are ``None`` when no
        graph was built; ``map_figure`` is ``None`` when no coordinates were
        obtained.
    """
    entry_type = [{'label': 'N/A'}]
    place_type = [{'label': 'N/A'}]
    cardinality = [{'label': 'N/A'}]
    img_path = None
    html_path = None
    coords = None
    places_md = ""
    relations_md = ""
    g = None

    spans = ner(text)

    # Headword = first 'Head' span. It stays None when NER finds none, so the
    # summary below can still be produced instead of hitting an unbound name.
    head = next(
        (_clean_head(span['word']) for span in spans if span['entity_group'] == 'Head'),
        None,
    )

    domain = domain_type_classifier(text)
    if domain[0]['label'] == 'Géographie':
        entry_type = entry_type_classifier(text)
        if entry_type[0]['label'] == 'Place':
            place_type = place_type_classifier(text)
            cardinality = cardinality_classifier(text)
            if cardinality[0]['label'] == 'Single':
                placenames = []
                relations = []
                entity_uris = {}
                statement_uris = {}

                EKG = Namespace("http://encyclokg.geo/")
                SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
                g = Graph()
                g.bind("rdf", RDF)
                g.bind("rdfs", RDFS)
                g.bind("xsd", XSD)
                g.bind("ekg", EKG)
                g.bind("skos", SKOS)

                # Entity ids come from a dedicated counter. entity_uris is
                # keyed by word, so its length does not grow on a repeated
                # placename and cannot be used to number entities uniquely.
                entity_count = 0
                uri_article = f"Entity{entity_count}"
                latlong = False
                aliases = None

                # First extract placenames and head and create their URIs
                # before relations, so every entity is in the graph.
                for span in spans:
                    group = span['entity_group']
                    if group == 'Head':
                        # The last 'Head' span wins here, as each one overwrites head.
                        head = _clean_head(span['word'])
                        aliases = segmentation_head(head)
                        print("aliases", aliases)
                    elif group == 'NP_Spatial' and len(span['word']) > 1:
                        word, context, _ = get_context(text, span, ngram_context_size=5)
                        label = placename_classifier(context)[0]['label']
                        placenames.append({'placename': word, 'label': label})

                        entity_count += 1
                        uri_entity = f"Entity{entity_count}"
                        # Stored on the span dict; the relation step below
                        # passes these same dicts to link_to_subject_object.
                        span['uri'] = uri_entity
                        span['label'] = label
                        entity_uris[word] = uri_entity

                        add_triplet(g, EKG[uri_entity], RDF.type, EKG[label])
                        # Drop an elided article from the stored label only.
                        if word.startswith(("l'", "d'")):
                            word = word[2:]
                        add_triplet(g, EKG[uri_entity], SKOS['prefLabel'], Literal(word))
                    elif group == 'Latlong':
                        latlong = True

                if head is not None:
                    entity_uris[head] = uri_article
                add_triplet(g, EKG[uri_article], RDF.type, EKG[place_type[0]['label']])
                if aliases:
                    add_triplet(g, EKG[uri_article], SKOS['prefLabel'], Literal(aliases[0]))
                    for alias in aliases[1:]:
                        add_triplet(g, EKG[uri_article], SKOS['altLabel'], Literal(alias))
                elif head is not None:
                    add_triplet(g, EKG[uri_article], SKOS['prefLabel'], Literal(head))

                if latlong:
                    generated = generator(text, max_length=128)
                    coords = dms_to_dd(generated[0]['generated_text'])
                    # NOTE(review): assumes dms_to_dd returns a (lat, lon)
                    # pair, or something falsy when it cannot convert - confirm.
                    if coords:
                        add_triplet(g, EKG[uri_article], EKG['latitude'], Literal(coords[0], datatype=XSD.float))
                        add_triplet(g, EKG[uri_article], EKG['longitude'], Literal(coords[1], datatype=XSD.float))

                # Keep only placename and relation spans longer than one character.
                filtered_spans = [
                    span for span in spans
                    if span['entity_group'] in ['NP_Spatial', 'Relation'] and len(span['word']) > 1
                ]

                # Then extract relations.
                for idx, span in enumerate(filtered_spans):
                    if span['entity_group'] != 'Relation':
                        continue
                    word, context, _ = get_context(text, span, ngram_context_size=5)
                    label = relation_classifier(context)[0]['label']
                    relations.append({'relation': word, 'label': label})
                    # A relation is only linked when it has a neighbour on both sides.
                    if 0 < idx < len(filtered_spans) - 1:
                        link_to_subject_object(
                            g, place_type[0]['label'], uri_article, label, word,
                            filtered_spans[idx - 1], filtered_spans[idx + 1],
                            statement_uris, EKG, RDF,
                        )

                span_object = pattern_starting_article(text, spans)
                if span_object is not None:
                    # The matched span may have no registered URI (for
                    # example a one-character placename); skip the statement
                    # then rather than fail on the lookup.
                    object_uri = entity_uris.get(span_object['word'])
                    if object_uri is not None:
                        stmt_uri = EKG[f"Statement{len(statement_uris)}"]
                        statement_uris["inclusion"] = stmt_uri
                        add_triplet(g, stmt_uri, RDF.subject, EKG[uri_article])
                        add_triplet(g, stmt_uri, RDF.object, EKG[object_uri])
                        add_triplet(g, stmt_uri, RDF.predicate, EKG["inclusion"])

                img_path, html_path = create_graph_viz(g)

                if placenames:
                    places_md = "| Placename | Type |\n|------------|------|\n" + "\n".join(
                        f"| {p['placename']} | {p['label']} |" for p in placenames
                    )
                else:
                    places_md = "_No placenames found._"

                if relations:
                    relations_md = "| Relation | Type |\n|-----------|------|\n" + "\n".join(
                        f"| {r['relation']} | {r['label']} |" for r in relations
                    )
                else:
                    relations_md = "_No relations found._"
            else:
                # NOTE(review): nesting reconstructed from a source whose
                # indentation was lost. This reset is attached to the
                # cardinality check (place type hidden for non-'Single'
                # entries) - confirm against the original file.
                place_type = [{'label': 'N/A'}]

    rdf_path = None
    if g is not None:
        # Reserve a unique name and close the handle before rdflib writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".ttl") as tmp:
            rdf_path = tmp.name
        g.serialize(destination=rdf_path, format="turtle")

    highlighted_spans = build_complete_spans(text, spans)

    shown_head = head if head is not None else "N/A"
    res = f"## Extracted informations for entry: **{shown_head}**\n"
    res += f"* Domain of entry (Géographie / Histoire / Médecine...): **{domain[0]['label']}**\n"
    res += f"* Type of entry (Place / Person / Other): **{entry_type[0]['label']}**\n"
    res += f"* Cardinality (Single / Multiple): **{cardinality[0]['label']}**\n"
    res += f"* Type of place (City / River / Mountain / Other...): **{place_type[0]['label']}**\n"

    map_figure = create_map(coords[0], coords[1]) if coords else None
    return img_path, html_path, res, highlighted_spans, places_md, relations_md, rdf_path, map_figure
# Sample entries offered in the UI through gr.Examples; the commented-out
# strings are further entries that are currently disabled.
examples = [
    "Jean de Luz, S. (Géog.) Lucius Vicus ; le nom basque est Loitzun, petite ville de France en Gascogne, la deuxieme du pays de Labour, & la derniere du côté de l'Espagne, avec un port. Elle est sur une petite riviere, que Piganiol de la Force nomme la Ninette, & M. de Lisle le Nivelet, à 4 lieues N. E. de Fontarabie, 4 S. O. de Bayonne, 174 S. O. de Paris. Long. 15. 59. 28. lat. 43. 23. 15. (D. J.)",
    "* BIDOUZE, (Géog.) riviere de la Gascogne, qui se jette dans la Gave près de Bayonne.",
    #"ISTURIE, (Géog.) petit village à cinq lieues de Bayonne dans le pays-basque, contrée d'Arberou. Je n'en parle que parce qu'il a donné son nom à une fameuse mine connue, & jadis exploitée par les Romains ; son ouverture avoit près de douze cent piés de profondeur. La montagne étoit percée pour l'écoulement des eaux d'une petite riviere qui la traverse : trois grosses tours dont une existe encore en partie, avec un retranchement d'une douzaine de toises de surface, & quelques fortifications au haut de la montagne, servoient à loger des soldats pour soutenir les mineurs. Des naturalistes qui ont examiné cet endroit, croyent que c'étoit une mine de fer, & ont regardé le grand souterrein comme une carriere d'où l'on tiroit la pierre. (D. J.)"
    #"NIVE, (Géog.) riviere du royaume de Navarre, appellée Errobi, dans la langue du pays. Elle descend des montagnes de la basse Navarre, se joint avec l'Adour dans les fossés de Bayonne, & va se jetter dans la mer à une lieue de cette ville. (D. J.)",
    #"Palais, Saint, (Géog. mod.) petite ville de France dans la basse Navarre, au diocese de Bayonne, sur la Bidouse, à 5 lieues de S. Jean Pié-de-Port, à qui elle dispute l'honneur d'être la capitale de la Navarre. Long. 16. 35. latit. 43. 20.",
    "Jean-pied-de-Port, S. (Géog.) ville de France en Gascogne, à une lieue des frontieres d'Espagne, autrefois capitale de la basse Navarre, avec une citadelle sur une hauteur. Antonin appelle ce lieu imus Pyrenoeus, le pié des Pyrénées, parce qu'en effet il est au pié de cette chaîne de montagnes ; dans ce pays-là on appelle port les passages ou défilés par où l'on peut traverser les Pyrénées, & comme cette ville de S. Jean est à l'entrée de ces ports ou passages, on la nomme S. Jean-pied-de port ; elle est sur la Nive, à l'entrée d'un des passages des Pyrénées, à 8 lieues S. E. de Bayonne, 12 N. E. de Pampelune, 176 S. O. de Paris. Long. 16. 22. lat. 43. 8. (D. J.)",
    #"PAU, (Géog. mod.) ville de France, regardée comme capitale du Béarn, avec un parlement, une chambre des comptes, & une cour des aides, unies au parlement, une sénéchaussée, un hôtel des monnoies. Elle est sur une hauteur, au pié de laquelle passe le Gave béarnois, à 10 lieues O. de Tarbes, 12 S. d'Aire, 39 S. de Bordeaux, 167 S. O. de Paris. Long. suivant Cassini, 17d. 22'. 30\". lat. 43d. 15'. Henri IV naquit à Pau, le 13 Décembre 1553, dans le château qui est au bout de la ville.",
]
# Gradio UI: an input row (text box, run button, examples) followed by a
# results row (summary, NER highlighting and tables beside the downloads, map
# and static graph preview).
# NOTE(review): the source's indentation was lost; the nesting of the
# Row/Column containers below is a best-effort reconstruction - confirm the
# intended layout against the original file.
with gr.Blocks(css="""
footer{display:none !important}
""") as demo:
    gr.Markdown("# GeoKnowledge Graph Builder from encyclopedic entries\n\n")
    gr.Markdown("This application extracts geographic entities and their spatial relations from encyclopedic text entries and builds a geo-knowledge graph in RDF format, along with visualizations and map outputs (when coordinates are extracted).\n" \
    "The proposed pipeline combines supervised learning and large language models for article classification, entity typing, and spatial relation extraction.\n")
    gr.Markdown("You can find more details about the models and the methodology in the [project webpage](https://gitlab.com/no-name-research/project).\n\n")
    # Input area: free-text entry and run button on the left, examples on the right.
    with gr.Row():
        with gr.Column(scale=6):
            inp = gr.Textbox(
                label="Enter text",
                placeholder="e.g. * BIDOUZE, (Géog.) riviere de la Gascogne, qui se jette dans la Gave près de Bayonne.",
                lines=3
            )
            run_btn = gr.Button("Run pipeline")
        with gr.Column(scale=4):
            gr.Examples(
                examples=examples,
                inputs=inp,
                label="Examples"
            )
            gr.HTML(f"Search for more examples on <a href='https://artflsrv04.uchicago.edu/philologic4.7/encyclopedie0922/' target='_blank'>ARTFL Encyclopedie</a>")
    # Results area.
    with gr.Row():
        # Textual results: summary, highlighted entities, placename/relation tables.
        with gr.Column():
            out_md = gr.Markdown("## Extracted informations")
            spans = gr.HighlightedText(label="Named Entity Recognition")
            with gr.Row():
                with gr.Column():
                    out_places = gr.Markdown(label="Placenames and their type (City/River/Mountain/Other...)")
                with gr.Column():
                    out_relations = gr.Markdown(label="Spatial relations and their type (Adjacency/Orientation/Distance/Other...)")
        # Graph results: downloadable files, map, and static preview image.
        with gr.Column():
            with gr.Row():
                gr.Markdown(f"## RDF graph and map outputs")
            with gr.Row():
                with gr.Column():
                    html_output = gr.File(label="Download Interactive Graph (HTML)", file_types=[".html"])
                    rdf_file = gr.File(label="Download RDF file (Turtle)", file_types=[".ttl"])
                    out_map = gr.Plot(label="Coordinates on Map")
                with gr.Column():
                    img_output = gr.Image(label="Static Graph Preview", type="filepath")
    # The outputs list must stay in the same order as run_pipeline's return tuple.
    run_btn.click(fn=run_pipeline, inputs=inp, outputs=[img_output, html_output, out_md, spans, out_places, out_relations, rdf_file, out_map])
# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()