import os
import re
import random
import gradio as gr
import requests
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from newspaper import Article
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# sent_tokenize needs NLTK's punkt data; fetch it once if missing
# (newer NLTK releases may also require the "punkt_tab" variant)
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
# --- Download the GGUF model from the Hugging Face Hub at startup (if not present) ---
from huggingface_hub import hf_hub_download

GGUF_REPO = "unsloth/gemma-3n-E4B-it-GGUF"
GGUF_FILENAME = "gemma-3n-E4B-it-Q4_K_M.gguf"  # must match a filename that actually exists in the repo

print("Checking for GGUF model...")
gguf_path = hf_hub_download(
    repo_id=GGUF_REPO,
    filename=GGUF_FILENAME,
    cache_dir="./"
)
print(f"GGUF model path: {gguf_path}")
# Load the GGUF model via llama-cpp-python
from llama_cpp import Llama

llm = Llama(
    model_path=gguf_path,
    n_ctx=2048,
    n_threads=4  # adjust to the number of available CPU cores
)
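# Optional smoke test (commented out): llama-cpp-python's __call__ returns an
# OpenAI-completion-shaped dict, e.g. {'choices': [{'text': '...'}], ...}.
# print(llm("Reply with OK.", max_tokens=4)["choices"][0]["text"])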
# --------- App settings ---------
PUBMED_N = 100       # number of abstracts to retrieve initially
TOP_ABSTRACTS = 10   # number of top semantic matches to keep per claim
NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
SBERT_MODEL_NAME = "pritamdeka/S-BioBert-snli-multinli-stsb"
NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
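# NOTE: this label order is an assumption; verify it against
# nli_model.config.id2label after loading, since MNLI-style checkpoints differ.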
# --------- Indicator phrases for claim extraction ---------
indicator_phrases = [
    "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
    "revealed", "reveals", "suggests", "suggested", "indicated", "indicates", "reported", "reports",
    "was reported", "concluded", "concludes", "conclusion", "authors state", "stated", "data suggest",
    "observed", "observes", "study suggests", "study shows", "study found", "researchers found",
    "results indicate", "results show", "confirmed", "confirm", "confirming", "point to",
    "documented", "document", "evidence of", "evidence suggests",
    "associated with", "correlated with", "link between", "linked to", "relationship between",
    "was linked", "connected to", "relationship with", "tied to", "association with",
    "increase", "increases", "increased", "decrease", "decreases", "decreased",
    "greater risk", "lower risk", "higher risk", "reduced risk", "raises the risk", "reduces the risk",
    "risk of", "risk for", "likelihood of", "probability of", "chance of", "rate of", "incidence of",
    "prevalence of", "mortality", "survival rate", "death rate", "odds of", "number of", "percentage of", "percent of",
    "caused by", "causes", "cause", "resulted in", "results in", "leads to", "led to", "contributed to", "responsible for",
    "due to", "as a result", "because of",
    "randomized controlled trial", "RCT", "clinical trial", "participants", "enrolled", "sample size", "statistically significant",
    "compared to", "compared with", "versus", "compared against",
    "more than", "less than", "greater than", "lower than", "higher than", "significantly higher", "significantly lower",
    "significantly increased", "significantly decreased", "significant difference",
    "effect of", "impact of", "influence of", "predictor of", "predicts", "predictive of", "factor for", "determinant of",
    "plays a role in", "contributes to", "related to", "affects", "influences", "difference between",
    "according to", "a recent study", "researchers from"
]
# --------- Load models (global, once) ---------
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
def extract_claims_pattern(article_text):
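    """Extract candidate factual claims from article text.

    A sentence is kept if it contains an indicator phrase or any number/
    percentage. This is a pattern heuristic, not a trained claim detector,
    so expect some noise. Illustrative example:

        extract_claims_pattern("The study found that X rose 5%. It rained.")
        # -> ['The study found that X rose 5%.']
    """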
    sentences = sent_tokenize(article_text)
    claims = [
        s for s in sentences
        if any(phrase.lower() in s.lower() for phrase in indicator_phrases)  # case-insensitive (e.g. "RCT")
        or re.search(r"\b\d+(\.\d+)?%?\b", s)
    ]
    return list(dict.fromkeys(claims))  # deduplicate, preserve order
def match_claims_to_headline(claims, headline, threshold=0.6):
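    """Keep claims whose SBERT cosine similarity to the headline is at least
    `threshold`; if none pass, fall back to the top 3 most similar claims."""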
    if not claims:
        return []
    headline_emb = sbert_model.encode([headline])
    claim_embs = sbert_model.encode(claims)
    sims = util.pytorch_cos_sim(headline_emb, claim_embs)[0]
    matched_claims = [claim for claim, sim in zip(claims, sims) if sim >= threshold]
    # fallback: keep the top 3 claims by similarity
    if not matched_claims:
        idxs = np.argsort(-sims.cpu().numpy())[:min(3, len(claims))]
        matched_claims = [claims[i] for i in idxs]
    return matched_claims
def retrieve_pubmed_abstracts(claim, n=PUBMED_N):
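    """Query PubMed via NCBI E-utilities and return (titles, abstracts).

    The XML is parsed with regexes for simplicity; xml.etree would be more
    robust against nested or attribute-bearing tags.
    """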
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    query = '+'.join(re.findall(r'\w+', claim))
    search_url = f"{ncbi_url}esearch.fcgi?db=pubmed&term={query}&retmax={n}&sort=relevance"
    r = requests.get(search_url, timeout=30)
    pmids = re.findall(r"<Id>(\d+)</Id>", r.text)
    if not pmids:
        return [], []
    ids = ','.join(pmids)
    fetch_url = f"{ncbi_url}efetch.fcgi?db=pubmed&id={ids}&rettype=xml&retmax={n}"
    resp = requests.get(fetch_url, timeout=60)
    titles = re.findall(r"<ArticleTitle>(.*?)</ArticleTitle>", resp.text, flags=re.DOTALL)
    abstracts = re.findall(r"<AbstractText.*?>(.*?)</AbstractText>", resp.text, flags=re.DOTALL)
    if not abstracts:
        abstracts = [""] * len(titles)
    titles = [re.sub(r"\s+", " ", t).strip() for t in titles]
    abstracts = [re.sub(r"\s+", " ", a).strip() for a in abstracts]
    # Regex parsing can yield unequal counts (e.g. multi-section abstracts);
    # truncate so titles and abstracts stay paired.
    k = min(len(titles), len(abstracts))
    return titles[:k], abstracts[:k]
def semantic_rerank_claim_abstracts(claim, titles, abstracts, top_k=TOP_ABSTRACTS):
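    """Rank abstracts by cosine similarity between the claim and the
    'title. abstract' text; return the top_k titles and abstracts."""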
    doc_texts = [f"{t}. {a}" for t, a in zip(titles, abstracts)]
    doc_embs = sbert_model.encode(doc_texts)
    claim_emb = sbert_model.encode([claim])
    sims = util.pytorch_cos_sim(claim_emb, doc_embs)[0]
    idxs = np.argsort(-sims.cpu().numpy())[:top_k]
    return [titles[i] for i in idxs], [abstracts[i] for i in idxs]
def extract_evidence_nli(claim, title, abstract):
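    """Run the NLI model over each abstract sentence (premise) against the
    claim (hypothesis); return a list of {sentence, label, score} dicts.

    `title` is accepted for call-site symmetry but not used in scoring.
    """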
    sentences = sent_tokenize(abstract)
    evidence = []
    for sent in sentences:
        encoding = nli_tokenizer(
            sent, claim,
            return_tensors='pt',
            truncation=True,
            max_length=256,
            padding=True
        )
        with torch.no_grad():
            outputs = nli_model(**encoding)
        probs = torch.softmax(outputs.logits, dim=1).cpu().numpy().flatten()
        max_idx = probs.argmax()
        label = NLI_LABELS[max_idx]
        score = float(probs[max_idx])
        evidence.append({
            "sentence": sent,
            "label": label,
            "score": score
        })
    return evidence
def summarize_evidence_llm(claim, evidence_list):
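    """Ask the local GGUF model for a short layperson verdict on the claim,
    given the entailing and contradicting evidence sentences."""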
    support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
    contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
    prompt = (
        f"Claim: {claim}\n"
        "Supporting evidence:\n" + ("\n".join(support) if support else "None") + "\n"
        "Contradicting evidence:\n" + ("\n".join(contradict) if contradict else "None") + "\n"
        "Explain to a layperson: Is this claim likely true, false, or uncertain based on the evidence above? "
        "Give a brief and simple explanation in 2-3 sentences."
    )
    try:
        output = llm(
            prompt,
            max_tokens=128,
            stop=["\n\n"],
            temperature=0.4,
            echo=False
        )
        summary = output['choices'][0]['text'].strip()
        return summary
    except Exception as e:
        return f"Summary could not be generated: {e}"
def format_evidence_html(evidence_list):
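    """Render evidence sentences as color-coded HTML blocks
    (green = support, red = contradict, gray = neutral)."""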
    color_map = {"ENTAILMENT": "#e6ffe6", "CONTRADICTION": "#ffe6e6", "NEUTRAL": "#f8f8f8"}
    html = ""
    for ev in evidence_list:
        color = color_map[ev["label"]]
        html += (
            f'<div style="background:{color};padding:6px;border-radius:6px;margin-bottom:3px">'
            f'<b>{ev["label"]}</b> (confidence {ev["score"]:.2f}): {ev["sentence"]}'
            '</div>'
        )
    return html
def factcheck_app(article_url):
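    """End-to-end pipeline for one article URL: download and parse the
    article, extract headline-matched claims, gather and label PubMed
    evidence, and return (HTML report, structured results)."""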
    try:
        art = Article(article_url)
        art.download()
        art.parse()
        text = art.text
        headline = art.title
    except Exception as e:
        return f"<b>Error downloading or reading article:</b> {e}", None
    claims = extract_claims_pattern(text)
    matched_claims = match_claims_to_headline(claims, headline)
    if not matched_claims:
        return "<b>No check-worthy claims found that match the headline.</b>", None
    results_html = ""
    all_results = []
    for claim in matched_claims:
        titles, abstracts = retrieve_pubmed_abstracts(claim)
        if not titles:
            results_html += f"<hr><b>Claim:</b> {claim}<br><i>No PubMed results found.</i><br>"
            all_results.append({"claim": claim, "summary": "No PubMed results found.", "evidence": []})
            continue
        top_titles, top_abstracts = semantic_rerank_claim_abstracts(claim, titles, abstracts)
        # Pick one random non-top abstract as a control, if any remain
        top_title_set = set(top_titles)
        non_top_idxs = [i for i, t in enumerate(titles) if t not in top_title_set]
        idx_non_top = random.choice(non_top_idxs) if non_top_idxs else None
        evidence_results = []
        for title, abstract in zip(top_titles, top_abstracts):
            evidence = extract_evidence_nli(claim, title, abstract)
            evidence_results.append({"title": title, "evidence": evidence})
        if idx_non_top is not None:
            control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
            evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
        all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
        summary = summarize_evidence_llm(claim, all_evidence_sentences)
        results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
        for abs_res in evidence_results:
            results_html += f"<br><b>Abstract:</b> {abs_res['title']}<br>{format_evidence_html(abs_res['evidence'])}"
        all_results.append({"claim": claim, "summary": summary, "evidence": evidence_results})
    return results_html, all_results
description = """
<b>What does this app do?</b><br>
This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
<b>How to use it:</b><br>
1. Paste the link to a biomedical news article.<br>
2. Wait for the results.<br>
3. For each claim, you will see:<br>
- A plain summary of what the research says.<br>
- Color-coded evidence sentences (green = support, red = contradict, gray = neutral).<br>
- The titles of the PubMed abstracts the evidence came from.<br><br>
<b>Everything is 100% open source and runs in this Space; no personal info or cloud API is needed.</b>
"""
iface = gr.Interface(
    fn=factcheck_app,
    inputs=gr.Textbox(lines=2, label="Paste a news article URL"),
    outputs=[gr.HTML(label="Fact-Check Results (Summary & Evidence)"), gr.JSON(label="All Results (JSON)")],
    title="BioMedical News Fact-Checking & Research Evidence Finder",
    description=description,
    examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant"]],
    allow_flagging="never"
)

iface.launch(share=False, server_name='0.0.0.0', show_error=True)