Spaces:

Kartik2204
/

Sentiment-Analysis

Sleeping

App Files Files Community

Kartikay Khosla committed on Oct 8

Commit

3e4638e

1 Parent(s): bfdf5dd

Deploy Streamlit app with Vertex AI Gemini

Browse files

Files changed (2) hide show

app.py +171 -86
requirements.txt +1 -7

app.py CHANGED Viewed

@@ -10,24 +10,41 @@ import torch
 from langdetect import detect
 import streamlit as st
 import io
-from newspaper import Article   # ✅ for URL input
-import concurrent.futures       # ✅ for safe timeout
-# ==== Gemini via API Key (AI Studio) ====
-import google.generativeai as genai
-api_key = os.getenv("GEMINI_API_KEY")
-if not api_key:
-    raise ValueError("❌ Missing GEMINI_API_KEY. Please set it as environment variable or in Hugging Face secrets.")
-# ✅ Configure Gemini with just the API key
-genai.configure(api_key=api_key)
-# ✅ Load Gemini model
-gemini_model = genai.GenerativeModel("gemini-pro")
 # ===============================
-# 🔧 Safe SpaCy + Stanza Downloads
 # ===============================
 def safe_load_spacy():
     try:
@@ -41,24 +58,28 @@ def safe_load_spacy():
 nlp_en = safe_load_spacy()
-# Ensure Stanza models exist
 stanza_dir = os.path.expanduser("~/.stanza_resources")
-if not os.path.exists(stanza_dir):
-    stanza.download('hi')
-    stanza.download('ta')
-stanza.download('hi')
-stanza.download('ta')
-nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
-nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
 # ===============================
-# Language-Aware Pipeline Loader
 # ===============================
 def load_pipelines(language_code):
     lang = language_code.upper()
     device = 0 if torch.cuda.is_available() else -1
     st.write(f"🌍 Language detected: {lang}")
     st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
@@ -97,7 +118,10 @@ def load_pipelines(language_code):
 # ===============================
 def read_and_split_articles(file_path):
     doc = docx.Document(file_path)
-    paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
     return paragraphs
 # ===============================
@@ -107,35 +131,38 @@ def read_article_from_url(url):
     article = Article(url)
     article.download()
     article.parse()
-    title = (article.title or "").strip()
-    body = (article.text or "").strip()
-    return f"{title}\n\n{body}".strip()
 # ===============================
-# Filter Neutral
 # ===============================
 def filter_neutral(emotion_results, neutral_threshold=0.75):
-    scores = {r["label"]: round(r["score"], 3)
-              for r in sorted(emotion_results, key=lambda x: x["score"], reverse=True)}
     if "neutral" in scores and scores["neutral"] > neutral_threshold:
         scores.pop("neutral")
     return scores
 # ===============================
-# Sentence Splitter
 # ===============================
 def split_sentences(text, lang):
     if lang == "hi":
-        sentences = re.split(r'।', text)
     elif lang == "ta":
-        sentences = re.split(r'\.', text)
     else:
         doc = nlp_en(text)
-        sentences = [sent.text.strip() for sent in doc.sents]
-    return [s.strip() for s in sentences if s.strip()]
 # ===============================
-# POS Tagger
 # ===============================
 def get_pos_tags(sentence, lang):
     if lang == "en":
@@ -143,10 +170,18 @@ def get_pos_tags(sentence, lang):
         return [(token.text, token.pos_) for token in doc]
     elif lang == "hi":
         doc = nlp_hi(sentence)
-        return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
     elif lang == "ta":
         doc = nlp_ta(sentence)
-        return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
     return []
 # ===============================
@@ -158,43 +193,70 @@ def normalize_scores(scores: dict):
     max_val = max(scores.values())
     if max_val == 0:
         return scores
-    return {k: round(v / max_val, 3) for k, v in scores.items()}
 # ===============================
-# Gemini – Generate Insight (Safe Hard Timeout)
 # ===============================
 def generate_insight(text, emotions, sentiment, level="Paragraph"):
     try:
-        top_emotions = sorted(emotions.items(), key=lambda x: x[1], reverse=True)[:3]
         emo_text = ", ".join([f"{k}: {v}" for k, v in top_emotions]) if top_emotions else "N/A"
-        sent_text = f"{sentiment['label']} ({round(sentiment['score'], 3)})" if sentiment else "N/A"
-        prompt = (
-            f"{level} to analyze:\n\n{text}\n\n"
-            f"Top 3 detected emotions (normalized 0–1): {emo_text}\n"
-            f"Overall sentiment: {sent_text}\n\n"
-            "You are an expert editor. Suggest concrete, content-specific rewrites and improvements "
-            "to increase clarity, engagement, trust, and emotional impact. Keep it actionable and concise. "
-            "Avoid repeating the original text; propose better alternatives."
-        )
-        # ✅ Run Gemini in background, kill after 15s
         with concurrent.futures.ThreadPoolExecutor() as executor:
-            future = executor.submit(lambda: gemini_model.generate_content(prompt))
             try:
-                response = future.result(timeout=15)
             except concurrent.futures.TimeoutError:
-                return top_emotions, "⚠️ Gemini request timed out after 15s."
         if response and getattr(response, "text", None):
-            return top_emotions, response.text.strip()
-        return top_emotions, "⚠️ No insight generated."
     except Exception as e:
         return [], f"⚠️ Insight generation failed: {str(e)}"
 # ===============================
-# Analysis Function
 # ===============================
 def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
     export_rows = []
@@ -203,7 +265,6 @@ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
     if len(paragraphs) <= 1:
         paragraphs = [p.strip() for p in article_text.split("\n") if p.strip()]
-    # ✅ Debug: show how many paragraphs detected
     st.write(f"📑 Paragraphs detected: {len(paragraphs)}")
     weighted_scores = {}
@@ -215,29 +276,40 @@ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
         for sentence in sentences:
             emo_results = emotion_pipeline(sentence[:512])[0]
             filtered = filter_neutral(emo_results)
             length = len(sentence.split())
             total_length += length
             for emo, score in filtered.items():
-                weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
             senti_res = sentiment_pipeline(sentence[:512])[0]
-            all_sentiments.append(max(senti_res, key=lambda x: x["score"]))
     if total_length > 0:
-        weighted_scores = {emo: val / total_length for emo, val in weighted_scores.items()}
         weighted_scores = normalize_scores(weighted_scores)
-        weighted_scores = dict(sorted(weighted_scores.items(), key=lambda x: x[1], reverse=True)[:10])
-    overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
     st.subheader("📊 OVERALL (Weighted)")
     st.write("Emotions →", weighted_scores)
     st.write("Sentiment →", overall_sentiment)
     top3_overall, overall_insight = generate_insight(
-        "Entire Article", weighted_scores, overall_sentiment, level="Overall Article"
     )
-    st.write("🔥 Top 3 Emotions (for Gemini) →", dict(top3_overall))
-    st.write("💡 Overall Insight →", overall_insight)
     export_rows.append({
         "Type": "Overall",
@@ -255,24 +327,34 @@ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
         sentences = split_sentences(para, lang[:2])
         for sentence in sentences:
             results = emotion_pipeline(sentence[:512])[0]
-            filtered = filter_neutral(results, neutral_threshold=0.75)
             for emo, score in filtered.items():
                 para_counter[emo] += score
             senti_res = sentiment_pipeline(sentence[:512])[0]
-            para_sentiments.append(max(senti_res, key=lambda x: x["score"]))
-        para_emotions = dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))
         para_emotions = normalize_scores(para_emotions)
-        para_emotions = dict(list(para_emotions.items())[:10])
-        para_sentiment = max(para_sentiments, key=lambda x: x["score"]) if para_sentiments else {}
         st.write(f"\n📑 Paragraph {p_idx}: {para}")
         st.write("Emotions →", para_emotions)
         st.write("Sentiment →", para_sentiment)
-        top3_para, insight = generate_insight(para, para_emotions, para_sentiment, level="Paragraph")
-        st.write("🔥 Top 3 Emotions (for Gemini) →", dict(top3_para))
-        st.write("💡 Insights + Rewrites →", insight)
         export_rows.append({
             "Type": "Paragraph",
@@ -291,6 +373,7 @@ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
 st.title("📑 Multilingual Text Emotion + Sentiment Analyzer")
 download_top = st.empty()
 uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
 url_input = st.text_input("Or enter an Article URL")
 text_input = st.text_area("Or paste text here")
@@ -298,18 +381,19 @@ text_input = st.text_area("Or paste text here")
 if st.button("🔍 Analyze"):
     with st.spinner("Running analysis... ⏳"):
         if uploaded_file:
-            doc_paras = read_and_split_articles(uploaded_file)
-            text_to_analyze = "\n\n".join(doc_paras)
         elif url_input.strip():
             text_to_analyze = read_article_from_url(url_input)
         elif text_input.strip():
             text_to_analyze = text_input
         else:
-            st.warning("Please upload a DOCX, enter a URL, or paste text to analyze.")
             st.stop()
         detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
         emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
         export_rows = analyze_article(text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline)
         df_export = pd.DataFrame(export_rows)
@@ -317,18 +401,19 @@ if st.button("🔍 Analyze"):
         with download_top.container():
             st.download_button(
-                label="⬇️ Download CSV",
-                data=csv,
-                file_name="analysis_results.csv",
-                mime="text/csv",
-                use_container_width=True,
             )
             excel_buffer = io.BytesIO()
             df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
             st.download_button(
-                label="⬇️ Download Excel",
-                data=excel_buffer,
-                file_name="analysis_results.xlsx",
-                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-                use_container_width=True,
             )

 from langdetect import detect
 import streamlit as st
 import io
+from newspaper import Article
+import concurrent.futures
+# ===============================
+# 🔑 Vertex AI Setup
+# ===============================
+import vertexai
+from vertexai.preview.generative_models import GenerativeModel
+import json
+import tempfile
+if "GCP_SERVICE_ACCOUNT_JSON" not in os.environ:
+    raise RuntimeError("❌ GCP_SERVICE_ACCOUNT_JSON secret not found in Hugging Face Space")
+# Write the JSON secret into a temp file
+with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as f:
+    f.write(os.environ["GCP_SERVICE_ACCOUNT_JSON"].encode("utf-8"))
+    SERVICE_ACCOUNT_PATH = f.name
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SERVICE_ACCOUNT_PATH
+PROJECT_ID = "prod-project-jnm-smart-cms"
+REGION = "us-central1"
+vertexai.init(project=PROJECT_ID, location=REGION)
+try:
+    gemini_model = GenerativeModel("gemini-2.5-pro")
+except Exception as e:
+    st.warning(f"⚠️ Falling back to gemini-2.5-flash due to: {e}")
+    gemini_model = GenerativeModel("gemini-2.5-flash")
 # ===============================
+# Safe SpaCy + Stanza Loads
 # ===============================
 def safe_load_spacy():
     try:
 nlp_en = safe_load_spacy()
 stanza_dir = os.path.expanduser("~/.stanza_resources")
+if not os.path.exists(os.path.join(stanza_dir, "hi")):
+    stanza.download("hi")
+if not os.path.exists(os.path.join(stanza_dir, "ta")):
+    stanza.download("ta")
+nlp_hi = stanza.Pipeline("hi", processors="tokenize,pos", use_gpu=torch.cuda.is_available())
+nlp_ta = stanza.Pipeline("ta", processors="tokenize,pos", use_gpu=torch.cuda.is_available())
+# ===============================
+# Streamlit run check
+# ===============================
+if not hasattr(st, "runtime") or not getattr(st.runtime, "exists", lambda: False)():
+    print("\n⚠️ WARNING: Run with `streamlit run app.py` instead of `python app.py`\n")
 # ===============================
+# Load Hugging Face Pipelines
 # ===============================
 def load_pipelines(language_code):
     lang = language_code.upper()
     device = 0 if torch.cuda.is_available() else -1
     st.write(f"🌍 Language detected: {lang}")
     st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
 # ===============================
 def read_and_split_articles(file_path):
     """Return the non-empty paragraphs of a DOCX document, whitespace-trimmed.

     ``file_path`` is passed straight to ``docx.Document``. Paragraphs whose
     text is empty after stripping are left out; order is preserved.
     """
     trimmed = (block.text.strip() for block in docx.Document(file_path).paragraphs)
     return [content for content in trimmed if content]
 # ===============================
     article = Article(url)
     article.download()
     article.parse()
+    return f"{article.title.strip()}\n\n{article.text.strip()}"
 # ===============================
+# Filter Neutral Emotions
 # ===============================
 def filter_neutral(emotion_results, neutral_threshold=0.75):
     """Map emotion labels to rounded scores, dropping a dominant neutral label.

     Args:
         emotion_results: iterable of dicts with a "label" and a "score" key.
         neutral_threshold: a neutral label is removed only when its rounded
             score is strictly greater than this value.

     Returns:
         dict of label -> score rounded to 3 decimals, ordered from highest to
         lowest score. Labels keep their original spelling.
     """
     ranked = sorted(emotion_results, key=lambda item: item["score"], reverse=True)
     scores = {item["label"]: round(item["score"], 3) for item in ranked}
     # Compare the label case-insensitively so "Neutral"/"NEUTRAL" are treated
     # like "neutral", matching the k.lower() check used in generate_insight.
     return {
         label: score
         for label, score in scores.items()
         if not (label.lower() == "neutral" and score > neutral_threshold)
     }
 # ===============================
+# Split Sentences
 # ===============================
 def split_sentences(text, lang):
     """Split ``text`` into trimmed, non-empty sentences.

     Args:
         text: the text to split; ``None`` or blank text yields ``[]``.
         lang: two-letter language code. "hi" and "ta" are split on
             sentence-ending punctuation with a regular expression; any other
             value is segmented with the module-level ``nlp_en`` pipeline.

     Returns:
         list of sentence strings without the terminating punctuation for
         "hi"/"ta", or the stripped ``sent.text`` values otherwise.
     """
     if not text or not text.strip():
         return []
     # Question and exclamation marks end sentences too, not only the danda
     # (Hindi) or the full stop (Tamil).
     terminators = {
         "hi": r"[।॥?!]+",
         "ta": r"[.?!]+",
     }
     pattern = terminators.get(lang)
     if pattern is not None:
         pieces = re.split(pattern, text)
     else:
         pieces = [sent.text for sent in nlp_en(text).sents]
     return [piece.strip() for piece in pieces if piece.strip()]
 # ===============================
+# POS Tagging
 # ===============================
 def get_pos_tags(sentence, lang):
     if lang == "en":
         return [(token.text, token.pos_) for token in doc]
     elif lang == "hi":
         doc = nlp_hi(sentence)
+        tags = []
+        for sent in doc.sentences:
+            for word in sent.words:
+                tags.append((word.text, word.upos))
+        return tags
     elif lang == "ta":
         doc = nlp_ta(sentence)
+        tags = []
+        for sent in doc.sentences:
+            for word in sent.words:
+                tags.append((word.text, word.upos))
+        return tags
     return []
 # ===============================
     max_val = max(scores.values())
     if max_val == 0:
         return scores
+    normalized = {}
+    for k, v in scores.items():
+        normalized[k] = round(v / max_val, 3)
+    return normalized
 # ===============================
+# Gemini Insight Generation
 # ===============================
 def generate_insight(text, emotions, sentiment, level="Paragraph"):
     """Ask Gemini for a short rewrite suggestion for ``text``.

     Args:
         text: the content to be rewritten (inserted verbatim in the prompt).
         emotions: dict of emotion label -> score; "neutral" (any case) is
             ignored and the three highest-scoring labels are reported.
         sentiment: dict with optional "label" and "score" keys, or a falsy
             value when no sentiment is available.
         level: name of the unit being analysed, used in the prompt wording.

     Returns:
         ``(top_emotions, message)`` where ``top_emotions`` is a list of up to
         three ``(label, score)`` pairs and ``message`` is the text to display.
         If anything other than a handled timeout goes wrong, returns
         ``([], message)`` with the error description.
     """
     executor = None
     try:
         filtered = {k: v for k, v in emotions.items() if k.lower() != "neutral"}
         sorted_emotions = sorted(filtered.items(), key=lambda x: x[1], reverse=True)
         top_emotions = sorted_emotions[:3]
         emo_text = ", ".join([f"{k}: {v}" for k, v in top_emotions]) if top_emotions else "N/A"
         sent_text = f"{sentiment.get('label','N/A')} ({round(sentiment.get('score',0), 3)})" if sentiment else "N/A"
         # Built line by line with explicit newlines so the prompt text does
         # not depend on how this source file is indented.
         prompt = (
             "\n"
             "You are an editorial coach.\n"
             f"Analyze this {level} and propose a rewrite.\n"
             "Content:\n"
             f"{text}\n"
             f"Detected Top Emotions → {emo_text}\n"
             f"Detected Sentiment → {sent_text}\n"
             "Your Output (concise):\n"
             "- 🔥 Suggested Rewrite (≤3 sentences, avoid repetition)\n"
             "- 💡 Why it Works (≤2 sentences, tie directly to emotions/sentiment)\n"
         )
         # The executor is deliberately NOT used as a context manager: leaving
         # a `with` block calls shutdown(wait=True), which would block until
         # the timed-out request finishes and make the timeouts meaningless.
         # Two workers let the Flash retry start while the Pro call is stuck.
         executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
         future = executor.submit(gemini_model.generate_content, prompt)
         try:
             response = future.result(timeout=40)
         except concurrent.futures.TimeoutError:
             try:
                 flash_model = GenerativeModel("gemini-2.5-flash")
                 st.warning("⚡ Retrying with Flash due to Pro timeout...")
                 future = executor.submit(flash_model.generate_content, prompt)
                 response = future.result(timeout=30)
             except Exception:
                 return top_emotions, f"⚠️ Gemini request timed out.\n\nDetected Emotions: {emo_text}, Sentiment: {sent_text}"
         if response and getattr(response, "text", None):
             final_text = (
                 f"🔥 Top 3 Emotions: {emo_text}\n"
                 f"🌓 Sentiment: {sent_text}\n\n"
                 f"{response.text.strip()}"
             )
             return top_emotions, final_text
         return top_emotions, f"⚠️ No insight generated.\n\nDetected Emotions: {emo_text}, Sentiment: {sent_text}"
     except Exception as e:
         return [], f"⚠️ Insight generation failed: {str(e)}"
     finally:
         if executor is not None:
             # Abandon any still-running request instead of waiting for it;
             # the worker thread finishes (or fails) in the background.
             executor.shutdown(wait=False)
 # ===============================
+# Main Analyzer
 # ===============================
 def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
     export_rows = []
     if len(paragraphs) <= 1:
         paragraphs = [p.strip() for p in article_text.split("\n") if p.strip()]
     st.write(f"📑 Paragraphs detected: {len(paragraphs)}")
     weighted_scores = {}
         for sentence in sentences:
             emo_results = emotion_pipeline(sentence[:512])[0]
             filtered = filter_neutral(emo_results)
             length = len(sentence.split())
             total_length += length
             for emo, score in filtered.items():
+                if emo not in weighted_scores:
+                    weighted_scores[emo] = 0
+                weighted_scores[emo] += score * length
             senti_res = sentiment_pipeline(sentence[:512])[0]
+            best_senti = max(senti_res, key=lambda x: x["score"])
+            all_sentiments.append(best_senti)
     if total_length > 0:
+        for emo in weighted_scores:
+            weighted_scores[emo] = weighted_scores[emo] / total_length
         weighted_scores = normalize_scores(weighted_scores)
+        sorted_scores = sorted(weighted_scores.items(), key=lambda x: x[1], reverse=True)
+        weighted_scores = dict(sorted_scores[:10])
+    if all_sentiments:
+        overall_sentiment = max(all_sentiments, key=lambda x: x["score"])
+    else:
+        overall_sentiment = {}
     st.subheader("📊 OVERALL (Weighted)")
     st.write("Emotions →", weighted_scores)
     st.write("Sentiment →", overall_sentiment)
     top3_overall, overall_insight = generate_insight(
+        article_text, weighted_scores, overall_sentiment, "Overall Article"
     )
+    st.write(overall_insight)
     export_rows.append({
         "Type": "Overall",
         sentences = split_sentences(para, lang[:2])
         for sentence in sentences:
             results = emotion_pipeline(sentence[:512])[0]
+            filtered = filter_neutral(results)
             for emo, score in filtered.items():
                 para_counter[emo] += score
             senti_res = sentiment_pipeline(sentence[:512])[0]
+            best_senti = max(senti_res, key=lambda x: x["score"])
+            para_sentiments.append(best_senti)
+        para_emotions = dict(para_counter)
         para_emotions = normalize_scores(para_emotions)
+        sorted_para = sorted(para_emotions.items(), key=lambda x: x[1], reverse=True)
+        para_emotions = dict(sorted_para[:10])
+        if para_sentiments:
+            para_sentiment = max(para_sentiments, key=lambda x: x["score"])
+        else:
+            para_sentiment = {}
         st.write(f"\n📑 Paragraph {p_idx}: {para}")
         st.write("Emotions →", para_emotions)
         st.write("Sentiment →", para_sentiment)
+        top3_para, insight = generate_insight(
+            para, para_emotions, para_sentiment, "Paragraph"
+        )
+        st.write(insight)
         export_rows.append({
             "Type": "Paragraph",
 st.title("📑 Multilingual Text Emotion + Sentiment Analyzer")
 download_top = st.empty()
 uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
 url_input = st.text_input("Or enter an Article URL")
 text_input = st.text_area("Or paste text here")
 if st.button("🔍 Analyze"):
     with st.spinner("Running analysis... ⏳"):
         if uploaded_file:
+            text_to_analyze = "\n\n".join(read_and_split_articles(uploaded_file))
         elif url_input.strip():
             text_to_analyze = read_article_from_url(url_input)
         elif text_input.strip():
             text_to_analyze = text_input
         else:
+            st.warning("Please provide text input.")
             st.stop()
         detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
         emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
         export_rows = analyze_article(text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline)
         df_export = pd.DataFrame(export_rows)
         with download_top.container():
             st.download_button(
+                "⬇️ Download CSV",
+                csv,
+                "analysis_results.csv",
+                "text/csv",
+                use_container_width=True
             )
             excel_buffer = io.BytesIO()
             df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
             st.download_button(
+                "⬇️ Download Excel",
+                excel_buffer,
+                "analysis_results.xlsx",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                use_container_width=True
             )

requirements.txt CHANGED Viewed

@@ -1,4 +1,3 @@
-# Core app
 streamlit
 pandas
 torch
@@ -11,10 +10,5 @@ openpyxl
 xlsxwriter
 lxml[html_clean]
 newspaper3k==0.2.8
-# Gemini (AI Studio API key mode only)
-google-generativeai>=0.3.0
-# ✅ SpaCy + English model
-spacy>=3.7.0
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

 streamlit
 pandas
 torch
 xlsxwriter
 lxml[html_clean]
 newspaper3k==0.2.8
+google-cloud-aiplatform>=1.66.0