Update app.py

app.py CHANGED
@@ -5,15 +5,17 @@ import requests
 import gradio as gr
 import numpy as np
 import nltk
-import nltkmodule
 from newspaper import Article
 from nltk.tokenize import sent_tokenize
-import xml.etree.ElementTree as ET
 from sentence_transformers import SentenceTransformer, util
 import spacy
 import en_core_sci_lg
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
+import nltkmodule
+
+# Download NLTK punkt if not present
+#nltk.download('punkt')
 
 # --- Models (load once, globally) ---
 scispacy = en_core_sci_lg.load()
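The commented-out nltk.download('punkt') above suggests the punkt tokenizer data is fetched inside nltkmodule instead. A minimal sketch of such a guard, assuming nothing about the real module beyond that purpose (the ensure_punkt name is illustrative, not from the commit):

import nltk

def ensure_punkt():
    # Download the punkt sentence tokenizer only if it is missing,
    # so repeated Space restarts do not re-download it.
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)

ensure_punkt()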
@@ -27,13 +29,35 @@ NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
 PUBMED_N = 100
 TOP_ABSTRACTS = 10
 
-# ---
-
-
-
-
-
-
+# --- Sentence section classifier model (BioBert-PubMed200kRCT) ---
+EVIDENCE_MODEL = "pritamdeka/BioBert-PubMed200kRCT"
+evidence_tokenizer = AutoTokenizer.from_pretrained(EVIDENCE_MODEL)
+evidence_model = AutoModelForSequenceClassification.from_pretrained(EVIDENCE_MODEL)
+label_map = {0: "BACKGROUND", 1: "OBJECTIVE", 2: "METHODS", 3: "RESULTS", 4: "CONCLUSIONS"}
+
+def extract_evidence_sentences_from_abstract(abstract, keep_labels=("RESULTS", "CONCLUSIONS")):
+    sents = sent_tokenize(abstract)
+    evidence_sents = []
+    for s in sents:
+        inputs = evidence_tokenizer(s, return_tensors="pt", truncation=True, padding=True)
+        with torch.no_grad():
+            logits = evidence_model(**inputs).logits
+        pred = torch.argmax(logits, dim=1).item()
+        label = label_map[pred]
+        if label in keep_labels:
+            evidence_sents.append((label, s))
+    return evidence_sents
+
+# --- Europe PMC retrieval ---
+def retrieve_europepmc_abstracts_simple(text, n=TOP_ABSTRACTS):
+    query = get_keybert_query(text, top_n=7)
+    print("Trying Europe PMC query:", query)
+    url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={query}&resulttype=core&format=json&pageSize={n}'
+    r = requests.get(url)
+    results = r.json().get('resultList', {}).get('result', [])
+    titles = [res.get('title', '') for res in results]
+    abstracts = [res.get('abstractText', '') for res in results]
+    return titles, abstracts
 
 # --- Utility: get robust keybert-style query ---
 def get_keybert_query(text, top_n=10):
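The new retrieve_europepmc_abstracts_simple interpolates the query directly into the URL. A standalone sketch of the same REST call (same endpoint, parameters, and JSON shape as above), with the query passed via params so requests handles URL encoding; that params-based variant is an assumption of this sketch, not what the commit does:

import requests

def search_europepmc(query, n=10):
    # Same Europe PMC search endpoint as retrieve_europepmc_abstracts_simple,
    # with the query passed as a parameter so requests URL-encodes it.
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    params = {"query": query, "resulttype": "core", "format": "json", "pageSize": n}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    results = r.json().get("resultList", {}).get("result", [])
    return [(res.get("title", ""), res.get("abstractText", "")) for res in results]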
@@ -52,48 +76,6 @@ def get_keybert_query(text, top_n=10):
     query = " OR ".join(f'"{kw}"' for kw in keywords)
     return query
 
-# --- PubMed retrieval ---
-
-
-def retrieve_pubmed_abstracts_simple(text, n=100, fallback_headline=None):
-    query = get_keybert_query(text, top_n=7)
-    if not query or query.strip() == '""':
-        query = fallback_headline
-    print("Trying PubMed query:", query)
-    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
-    tried_queries = [q for q in [query, fallback_headline, text] if q]
-
-    for q in tried_queries:
-        # Always request XML, never parse as JSON or HTML
-        search_url = f"{ncbi_url}esearch.fcgi?db=pubmed&term={q}&retmax={n}&sort=relevance&retmode=xml"
-        r = requests.get(search_url)
-        try:
-            root = ET.fromstring(r.text)
-            pmids = [el.text for el in root.findall('.//Id')]
-        except Exception as e:
-            print(f"Failed to parse PMIDs for query '{q}': {e}")
-            pmids = []
-        print(f"Query: {q} => {len(pmids)} PMIDs")
-        if pmids:
-            ids = ','.join(pmids)
-            fetch_url = f"{ncbi_url}efetch.fcgi?db=pubmed&id={ids}&rettype=abstract&retmode=xml&retmax={n}&sort=relevance"
-            resp = requests.get(fetch_url)
-            try:
-                root2 = ET.fromstring(resp.text)
-                titles = [a.text for a in root2.findall('.//ArticleTitle')]
-                abstracts = [b.text for b in root2.findall('.//AbstractText')]
-            except Exception as e:
-                print(f"Failed to parse titles/abstracts for query '{q}': {e}")
-                titles, abstracts = [], []
-            # Sanitize output
-            if not abstracts:
-                abstracts = [""] * len(titles)
-            titles = [re.sub(r"\s+", " ", t).strip() if t else "" for t in titles]
-            abstracts = [re.sub(r"\s+", " ", a).strip() if a else "" for a in abstracts]
-            return titles, abstracts
-    return [], []
-
-
 # --- Claim extraction ---
 indicator_phrases = [
     "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
@@ -147,11 +129,10 @@ def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACT
     idxs = np.argsort(-sims.cpu().numpy())[:top_k]
     return [titles[i] for i in idxs], [abstracts[i] for i in idxs]
 
-# --- NLI evidence extraction ---
-def extract_evidence_nli(claim, title, abstract):
-    sentences = sent_tokenize(abstract)
+# --- NLI evidence extraction (run only on results/conclusion sentences) ---
+def extract_evidence_nli(claim, evidence_sentences):
     evidence = []
-    for sent in sentences:
+    for sent in evidence_sentences:
         encoding = nli_tokenizer(
             sent, claim,
             return_tensors='pt',
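The loop body continues into nli_tokenizer(sent, claim, ...); a minimal sketch of how one premise/hypothesis pair would be scored under the NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT'] ordering shown in the hunk context above. The nli_model handle is an assumption (the commit does not show where the NLI model is loaded), and label order varies by checkpoint:

import torch

def score_pair(sent, claim):
    # Premise = abstract sentence, hypothesis = extracted claim.
    encoding = nli_tokenizer(sent, claim, return_tensors='pt',
                             truncation=True, padding=True)
    with torch.no_grad():
        logits = nli_model(**encoding).logits  # nli_model: assumed global like nli_tokenizer
    probs = torch.softmax(logits, dim=1)[0]
    idx = int(torch.argmax(probs))
    return NLI_LABELS[idx], float(probs[idx])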
@@ -172,7 +153,14 @@ def extract_evidence_nli(claim, title, abstract):
         })
     return evidence
 
-# --- Summarizer model
+# --- Summarizer model options ---
+model_options = {
+    "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
+    "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
+    "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+}
+pipe_cache = {}
+
 def get_summarizer(model_choice):
     model_id = model_options[model_choice]
     if model_id in pipe_cache:
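get_summarizer begins with a pipe_cache lookup, but the rest of the function falls outside this hunk. A sketch of the caching pattern it implies; the "text-generation" task and the pipeline kwargs are assumptions, and the _sketch name marks this as illustrative:

from transformers import pipeline

def get_summarizer_sketch(model_choice):
    model_id = model_options[model_choice]
    if model_id in pipe_cache:
        return pipe_cache[model_id]  # reuse the pipeline built on a previous call
    pipe = pipeline("text-generation", model=model_id)  # task is an assumption
    pipe_cache[model_id] = pipe
    return pipe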
@@ -249,20 +237,23 @@ def factcheck_app(article_url, summarizer_choice):
     results_html = ""
     all_results = []
     for claim in matched_claims:
-        titles, abstracts =
+        titles, abstracts = retrieve_europepmc_abstracts_simple(claim)
         if not titles:
-            results_html += f"<hr><b>Claim:</b> {claim}<br><i>No
-            all_results.append({"claim": claim, "summary": "No
+            results_html += f"<hr><b>Claim:</b> {claim}<br><i>No relevant abstracts found in Europe PMC.</i><br>"
+            all_results.append({"claim": claim, "summary": "No abstracts found.", "evidence": []})
             continue
         top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
-        idx_non_top = random.choice([i for i in range(len(titles)) if i not in [titles.index(t) for t in top_titles]]) if len(titles) > len(top_titles) else None
         evidence_results = []
         for title, abstract in zip(top_titles, top_abstracts):
-            evidence = extract_evidence_nli(claim, title, abstract)
+            # Extract evidence (results/conclusions) sentences from abstract
+            ev_sents = extract_evidence_sentences_from_abstract(abstract)
+            # If none found, fallback to all sentences
+            if ev_sents:
+                sent_list = [s for lbl, s in ev_sents]
+            else:
+                sent_list = sent_tokenize(abstract)
+            evidence = extract_evidence_nli(claim, sent_list)
             evidence_results.append({"title": title, "evidence": evidence})
-        if idx_non_top is not None:
-            control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
-            evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
         all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
         summary = summarize_evidence_llm(claim, all_evidence_sentences, summarizer_choice)
         results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
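The description below promises color-coded evidence sentences, but the rendering code is outside this diff. A hedged sketch, assuming each evidence dict produced by extract_evidence_nli carries "label" and "sentence" keys (the actual keys are not visible in these hunks):

COLORS = {"ENTAILMENT": "green", "CONTRADICTION": "red", "NEUTRAL": "gray"}

def render_evidence_html(evidence):
    # evidence: list of dicts, assumed shape {"label": ..., "sentence": ...}
    parts = []
    for ev in evidence:
        color = COLORS.get(ev.get("label", "NEUTRAL"), "gray")
        parts.append(f'<span style="color:{color}">{ev.get("sentence", "")}</span>')
    return "<br>".join(parts)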
@@ -273,7 +264,7 @@ def factcheck_app(article_url, summarizer_choice):
 
 description = """
 <b>What does this app do?</b><br>
-This app extracts key scientific claims from a news article, finds the most relevant
+This app extracts key scientific claims from a news article, finds the most relevant biomedical research papers using robust keyphrase extraction and semantic reranking, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
 <b>How to use it:</b><br>
 1. Paste the link to a biomedical news article.<br>
 2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
@@ -281,7 +272,7 @@ This app extracts key scientific claims from a news article, finds the most rele
 4. For each claim, you will see:<br>
    - A plain summary of what research says.<br>
    - Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
-   - The titles of the most relevant
+   - The titles of the most relevant research articles.<br><br>
 <b>Everything is 100% open source and runs on this website—no personal info or cloud API needed.</b>
 """
 