Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -8,6 +8,7 @@ import nltk
 import nltkmodule
 from newspaper import Article
 from nltk.tokenize import sent_tokenize
+import xml.etree.ElementTree as ET
 from sentence_transformers import SentenceTransformer, util
 import spacy
 import en_core_sci_lg
@@ -52,23 +53,46 @@ def get_keybert_query(text, top_n=10):
     return query

 # --- PubMed retrieval ---
-
+
+
+def retrieve_pubmed_abstracts_simple(text, n=100, fallback_headline=None):
+    query = get_keybert_query(text, top_n=7)
+    if not query or query.strip() == '""':
+        query = fallback_headline
+    print("Trying PubMed query:", query)
     ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
-
-
-
-
-
-
-
-
-
-
-
-
-
+    tried_queries = [q for q in [query, fallback_headline, text] if q]
+
+    for q in tried_queries:
+        # Always request XML, never parse as JSON or HTML
+        search_url = f"{ncbi_url}esearch.fcgi?db=pubmed&term={q}&retmax={n}&sort=relevance&retmode=xml"
+        r = requests.get(search_url)
+        try:
+            root = ET.fromstring(r.text)
+            pmids = [el.text for el in root.findall('.//Id')]
+        except Exception as e:
+            print(f"Failed to parse PMIDs for query '{q}': {e}")
+            pmids = []
+        print(f"Query: {q} => {len(pmids)} PMIDs")
+        if pmids:
+            ids = ','.join(pmids)
+            fetch_url = f"{ncbi_url}efetch.fcgi?db=pubmed&id={ids}&rettype=abstract&retmode=xml&retmax={n}&sort=relevance"
+            resp = requests.get(fetch_url)
+            try:
+                root2 = ET.fromstring(resp.text)
+                titles = [a.text for a in root2.findall('.//ArticleTitle')]
+                abstracts = [b.text for b in root2.findall('.//AbstractText')]
+            except Exception as e:
+                print(f"Failed to parse titles/abstracts for query '{q}': {e}")
+                titles, abstracts = [], []
+            # Sanitize output
+            if not abstracts:
+                abstracts = [""] * len(titles)
+            titles = [re.sub(r"\s+", " ", t).strip() if t else "" for t in titles]
+            abstracts = [re.sub(r"\s+", " ", a).strip() if a else "" for a in abstracts]
+            return titles, abstracts
+    return [], []

-#return [], []

 # --- Claim extraction ---
 indicator_phrases = [
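For orientation, a minimal sketch of how the new helper might be called, assuming `requests`, `re`, and the `get_keybert_query` helper are imported and defined earlier in app.py (outside these hunks); the article text and headline below are made-up inputs:

    # Hypothetical inputs; retrieve_pubmed_abstracts_simple is the function added above.
    article_text = (
        "A new trial suggests daily vitamin D supplementation may reduce "
        "the rate of respiratory infections in older adults."
    )
    titles, abstracts = retrieve_pubmed_abstracts_simple(
        article_text,
        n=20,
        fallback_headline="Vitamin D and respiratory infections",
    )
    # Print the first few retrieved records.
    for title, abstract in zip(titles[:3], abstracts[:3]):
        print(title)
        print(abstract[:200])

Note that the query is interpolated into the ESearch URL as-is; if `get_keybert_query` can return text with spaces or special characters, escaping it (for example with `urllib.parse.quote_plus`, or by passing `params=` to `requests.get`) may be needed, but that is outside this diff.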
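The retrieval code parses both E-utilities responses with ElementTree, which is why it always requests `retmode=xml`. A self-contained sketch of that parsing step, run against a trimmed-down stand-in for an ESearch payload (the `Id` element is what the diff queries with `.//Id`; the sample values are invented):

    import xml.etree.ElementTree as ET

    # Minimal stand-in for an ESearch response; real payloads carry more fields.
    sample_esearch = """
    <eSearchResult>
      <Count>2</Count>
      <IdList>
        <Id>12345678</Id>
        <Id>23456789</Id>
      </IdList>
    </eSearchResult>
    """
    root = ET.fromstring(sample_esearch)
    pmids = [el.text for el in root.findall('.//Id')]
    print(pmids)  # ['12345678', '23456789']

The EFetch result is handled the same way, with `.//ArticleTitle` and `.//AbstractText` in place of `.//Id`.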