Spaces:

Kartik2204
/

Sentiment-Analysis

Sleeping

App Files Files Community

Kartikay Khosla committed on Sep 19

Commit

14d6a4f

1 Parent(s): 149d94e

Update app.py and requirements.txt with URL support and emotion filter

Browse files

Files changed (5) hide show

.DS_Store +0 -0
SAURL +313 -0
SAURL.py +313 -0
app copy.py +295 -0
requirements.txt +4 -1

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

SAURL ADDED Viewed

	@@ -0,0 +1,313 @@

+import os
+import spacy
+import stanza
+import pandas as pd
+import re
+import docx
+from collections import Counter
+import stanza
+from transformers import pipeline
+import torch
+from langdetect import detect
+import streamlit as st
+import io
+from newspaper import Article   # ✅ for URL input
+# ===============================
+# 🔧 Safe SpaCy + Stanza Downloads
+# ===============================
+def safe_load_spacy():
+    """Load an English spaCy pipeline, preferring the transformer model.
+
+    Tries ``en_core_web_trf`` first and ``en_core_web_sm`` second. If neither
+    is installed, ``en_core_web_sm`` is downloaded and then loaded.
+
+    Returns:
+        The loaded spaCy pipeline object.
+
+    Raises:
+        subprocess.CalledProcessError: If the model download command fails.
+        OSError: If the model still cannot be loaded after the download.
+    """
+    # Local imports keep this block independent of the file-level import list.
+    import importlib
+    import subprocess
+    import sys
+    for model_name in ("en_core_web_trf", "en_core_web_sm"):
+        try:
+            return spacy.load(model_name)
+        except OSError:
+            # Model not loadable: fall through to the next candidate.
+            continue
+    # Run the download with the interpreter that is executing this app, not
+    # whichever "python" happens to be first on PATH, and fail loudly.
+    subprocess.run(
+        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
+        check=True,
+    )
+    # Make the freshly installed package visible to this running process.
+    importlib.invalidate_caches()
+    return spacy.load("en_core_web_sm")
+# English pipeline is loaded once at import time and shared by all helpers.
+nlp_en = safe_load_spacy()
+# Retained as a module-level name; it is no longer used to guard downloads.
+stanza_dir = os.path.expanduser("~/.stanza_resources")
+# Each language is downloaded exactly once. Previously a directory-existence
+# guard was followed by the same downloads run unconditionally, so the guard
+# had no effect and a fresh machine downloaded every model twice.
+for _stanza_lang in ('hi', 'ta'):
+    stanza.download(_stanza_lang)
+_use_gpu = torch.cuda.is_available()
+nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=_use_gpu)
+nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=_use_gpu)
+# ===============================
+# Language-Aware Pipeline Loader
+# ===============================
+def load_pipelines(language_code):
+    """Create the emotion and sentiment classifiers for a language.
+
+    Args:
+        language_code: Language code string; compared case-insensitively.
+
+    Returns:
+        Tuple ``(emotion_pipeline, sentiment_pipeline)`` of text-classification
+        pipelines created with ``return_all_scores=True``.
+    """
+    lang = language_code.upper()
+    device = 0 if torch.cuda.is_available() else -1
+    st.write(f"🌍 Language detected: {lang}")
+    st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
+    def _build_classifier(model_name):
+        # Model and tokenizer always come from the same checkpoint.
+        return pipeline(
+            "text-classification",
+            model=model_name,
+            tokenizer=model_name,
+            return_all_scores=True,
+            device=device
+        )
+    # Hindi/Tamil use the BERT checkpoint; English and everything else share
+    # the RoBERTa one.
+    if lang in ["HI", "TA"]:
+        emo_model = "bhadresh-savani/bert-base-go-emotion"
+    else:
+        emo_model = "SamLowe/roberta-base-go_emotions"
+    emotion_pipeline = _build_classifier(emo_model)
+    # Only English gets the SST-2 model; all other languages use the
+    # multilingual checkpoint.
+    if lang == "EN":
+        sent_model = "distilbert-base-uncased-finetuned-sst-2-english"
+    else:
+        sent_model = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
+    sentiment_pipeline = _build_classifier(sent_model)
+    return emotion_pipeline, sentiment_pipeline
+# ===============================
+# DOCX Reader – keep paras separate
+# ===============================
+def read_and_split_articles(file_path):
+    """Return the non-empty paragraphs of a DOCX document, in order.
+
+    Args:
+        file_path: Value handed to ``docx.Document`` (the app passes the
+            uploaded file object).
+
+    Returns:
+        List of paragraph strings with surrounding whitespace removed.
+    """
+    document = docx.Document(file_path)
+    stripped_texts = (para.text.strip() for para in document.paragraphs)
+    return [text for text in stripped_texts if text]
+# ===============================
+# URL Reader – title + main body
+# ===============================
+def read_article_from_url(url):
+    """Download an article and return its title and body as one text.
+
+    Args:
+        url: Article URL; surrounding whitespace is ignored.
+
+    Returns:
+        The stripped title and body joined by a blank line. When one of the
+        two is empty, only the other is returned, so the text never starts
+        or ends with stray blank lines.
+
+    Raises:
+        Whatever ``Article.download`` / ``Article.parse`` raise on failure.
+    """
+    # The caller validates with .strip() but passes the raw input through.
+    article = Article(url.strip())
+    article.download()
+    article.parse()
+    title = (article.title or "").strip()
+    body = (article.text or "").strip()
+    return "\n\n".join(part for part in (title, body) if part)
+# ===============================
+# Filter Neutral
+# ===============================
+def filter_neutral(emotion_results, neutral_threshold=0.75):
+    """Map labels to rounded scores, dropping a dominant ``neutral`` label.
+
+    Args:
+        emotion_results: Iterable of dicts with ``"label"`` and ``"score"``.
+        neutral_threshold: ``neutral`` is removed only when its rounded score
+            is strictly greater than this value.
+
+    Returns:
+        Dict of label -> score rounded to 3 decimals, ordered from highest to
+        lowest score.
+    """
+    ranked = sorted(emotion_results, key=lambda item: item["score"], reverse=True)
+    scores = {}
+    for item in ranked:
+        scores[item["label"]] = round(item["score"], 3)
+    neutral_score = scores.get("neutral")
+    if neutral_score is not None and neutral_score > neutral_threshold:
+        del scores["neutral"]
+    return scores
+# ===============================
+# Sentence Splitter
+# ===============================
+def split_sentences(text, lang):
+    """Split text into non-empty, stripped sentences.
+
+    Args:
+        text: Text to split.
+        lang: ``"hi"`` splits on the danda, ``"ta"`` splits on full stops,
+            any other value uses the English spaCy pipeline's sentences.
+
+    Returns:
+        List of sentence strings; for ``"hi"``/``"ta"`` the delimiter itself
+        is not kept.
+    """
+    delimiter_patterns = {"hi": r'।', "ta": r'\.'}
+    pattern = delimiter_patterns.get(lang)
+    if pattern is None:
+        pieces = [sent.text.strip() for sent in nlp_en(text).sents]
+    else:
+        pieces = re.split(pattern, text)
+    cleaned = (piece.strip() for piece in pieces)
+    return [piece for piece in cleaned if piece]
+# ===============================
+# POS Tagger
+# ===============================
+def get_pos_tags(sentence, lang):
+    """Return ``(token, POS tag)`` pairs for a sentence.
+
+    Args:
+        sentence: Sentence text.
+        lang: ``"en"`` uses spaCy, ``"hi"``/``"ta"`` use the matching Stanza
+            pipeline; any other value yields an empty list.
+
+    Returns:
+        List of ``(text, tag)`` tuples.
+    """
+    if lang == "en":
+        return [(token.text, token.pos_) for token in nlp_en(sentence)]
+    if lang == "hi":
+        stanza_nlp = nlp_hi
+    elif lang == "ta":
+        stanza_nlp = nlp_ta
+    else:
+        return []
+    tagged = []
+    for sent in stanza_nlp(sentence).sentences:
+        tagged.extend((word.text, word.upos) for word in sent.words)
+    return tagged
+# ===============================
+# Analysis Function
+# ===============================
+def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
+    """Analyse an article at overall, paragraph and sentence level.
+
+    Every sentence is classified once (first 512 characters only) and that
+    result feeds all three views, which are rendered with Streamlit in the
+    order overall -> paragraphs -> sentences.
+
+    Args:
+        article_text: Full text. Paragraphs are separated by blank lines,
+            falling back to single newlines when that yields at most one.
+        lang: Language code; only its first two characters are used.
+        emotion_pipeline: Callable returning ``[[{"label", "score"}, ...]]``
+            for a single input string.
+        sentiment_pipeline: Callable with the same output shape.
+
+    Returns:
+        ``(results_summary, export_rows, emotion_to_sentences)``:
+        one dict per sentence; the overall/paragraph/sentence rows for
+        export; and a mapping from an emotion to the labelled sentences in
+        which it ranked among that sentence's five strongest emotions.
+        Emotions that never reach a top five are not present in the mapping.
+    """
+    results_summary = []
+    export_rows = []
+    emotion_to_sentences = {}
+    lang_code = lang[:2]
+    paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
+    if len(paragraphs) <= 1:
+        paragraphs = [p.strip() for p in article_text.split("\n") if p.strip()]
+    # Single inference pass. The three views below used to re-run the models
+    # on the same sentences (emotion 3x, sentiment 2x).
+    # analysed: [(paragraph, [(sentence, emotions, top_emotions, sentiment)])]
+    analysed = []
+    for para in paragraphs:
+        rows = []
+        for sentence in split_sentences(para, lang_code):
+            snippet = sentence[:512]
+            emotions = filter_neutral(emotion_pipeline(snippet)[0], neutral_threshold=0.75)
+            top_emotions = sorted(emotions, key=emotions.get, reverse=True)[:5]
+            best_sentiment = max(sentiment_pipeline(snippet)[0], key=lambda x: x["score"])
+            rows.append((sentence, emotions, top_emotions, best_sentiment))
+        analysed.append((para, rows))
+    # Weighted overall results: each sentence weighs in by its word count.
+    weighted_scores = {}
+    total_length = 0
+    all_sentiments = []
+    for _, rows in analysed:
+        for sentence, emotions, _, best_sentiment in rows:
+            length = len(sentence.split())
+            total_length += length
+            for emo, score in emotions.items():
+                weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
+            all_sentiments.append(best_sentiment)
+    if total_length > 0:
+        weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
+    # NOTE(review): this reports the single most confident sentence-level
+    # sentiment, not an aggregate over the article; kept as-is.
+    overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
+    st.subheader("📊 OVERALL (Weighted)")
+    st.write("Emotions →", weighted_scores)
+    st.write("Sentiment →", overall_sentiment)
+    export_rows.append({
+        "Type": "Overall",
+        "Text": "Weighted across article",
+        "Emotions": weighted_scores,
+        "Sentiment": overall_sentiment
+    })
+    # Paragraph-level: sum of sentence scores per emotion, strongest first.
+    for p_idx, (para, rows) in enumerate(analysed, start=1):
+        para_counter = Counter()
+        for sentence, emotions, top_emotions, _ in rows:
+            for emo, score in emotions.items():
+                para_counter[emo] += score
+            for emo in top_emotions:
+                emotion_to_sentences.setdefault(emo, []).append(f"(Para {p_idx}) {sentence}")
+        para_emotions = dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))
+        st.write(f"\n📑 Paragraph {p_idx}: {para}")
+        st.write("Emotions →", para_emotions)
+        export_rows.append({
+            "Type": "Paragraph",
+            "Text": para,
+            "Emotions": para_emotions,
+            "Sentiment": ""
+        })
+    # Sentence-level
+    st.subheader("📝 SENTENCES")
+    for _, rows in analysed:
+        for sentence, emotions, top_emotions, best_sentiment in rows:
+            pos_tags = get_pos_tags(sentence, lang_code)
+            results_summary.append({
+                "sentence": sentence,
+                "pos_tags": pos_tags,
+                "emotions": emotions,
+                "sentiment": best_sentiment
+            })
+            st.write(f"Sentence: {sentence}")
+            st.write(f"POS Tags → {pos_tags}")
+            st.write(f"Emotions → {emotions}")
+            st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'],4)})\n")
+            for emo in top_emotions:
+                emotion_to_sentences.setdefault(emo, []).append(f"(Sentence) {sentence}")
+            export_rows.append({
+                "Type": "Sentence",
+                "Text": sentence,
+                "Emotions": emotions,
+                "Sentiment": best_sentiment
+            })
+    return results_summary, export_rows, emotion_to_sentences
+# ===============================
+# Streamlit App
+# ===============================
+st.title("📑 Multilingual Text Emotion + Sentiment Analyzer")
+uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
+url_input = st.text_input("Or enter an Article URL")
+text_input = st.text_area("Or paste text here")
+if st.button("🔍 Analyze"):
+    # langdetect is already a dependency of this file; only the exception
+    # type is additionally needed here.
+    from langdetect import LangDetectException
+    with st.spinner("Running analysis... ⏳"):
+        # Input priority: uploaded DOCX, then URL, then pasted text.
+        if uploaded_file:
+            articles = read_and_split_articles(uploaded_file)
+            text_to_analyze = "\n\n".join(articles)
+        elif url_input.strip():
+            try:
+                text_to_analyze = read_article_from_url(url_input.strip())
+            except Exception as exc:
+                # UI boundary: show fetch/parse failures instead of a traceback.
+                st.error(f"Could not read the article from that URL: {exc}")
+                st.stop()
+        elif text_input.strip():
+            text_to_analyze = text_input
+        else:
+            st.warning("Please upload a DOCX, enter a URL, or paste text to analyze.")
+            st.stop()
+        # Detection runs on the first 200 characters; fall back to English
+        # when langdetect cannot decide (e.g. digits or punctuation only).
+        try:
+            detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
+        except LangDetectException:
+            detected_lang = "en"
+        emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
+        results, export_rows, emotion_to_sentences = analyze_article(
+            text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline
+        )
+        # ✅ Download buttons FIRST
+        df_export = pd.DataFrame(export_rows)
+        csv = df_export.to_csv(index=False).encode("utf-8")
+        st.download_button(
+            label="⬇️ Download CSV",
+            data=csv,
+            file_name="analysis_results.csv",
+            mime="text/csv",
+        )
+        excel_buffer = io.BytesIO()
+        df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
+        st.download_button(
+            label="⬇️ Download Excel",
+            data=excel_buffer,
+            file_name="analysis_results.xlsx",
+            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        )
+        # ✅ Emotion filter tabs at the end; emotions without any sentence
+        # are skipped so no empty tab is rendered.
+        emotion_list = [emo for emo, texts in emotion_to_sentences.items() if texts]
+        if emotion_list:
+            st.subheader("🎭 Explore by Emotion (Top 5 only)")
+            tabs = st.tabs(emotion_list)
+            for tab, emo in zip(tabs, emotion_list):
+                with tab:
+                    st.write(f"### 🔹 {emo.upper()}")
+                    for text in emotion_to_sentences[emo]:
+                        st.write(f"- {text}")
+        else:
+            st.info("No emotions strong enough to show in Top 5 filters.")

SAURL.py ADDED Viewed

	@@ -0,0 +1,313 @@

+import os
+import spacy
+import stanza
+import pandas as pd
+import re
+import docx
+from collections import Counter
+import stanza
+from transformers import pipeline
+import torch
+from langdetect import detect
+import streamlit as st
+import io
+from newspaper import Article   # ✅ for URL input
+# ===============================
+# 🔧 Safe SpaCy + Stanza Downloads
+# ===============================
+def safe_load_spacy():
+    """Load an English spaCy pipeline, preferring the transformer model.
+
+    Tries ``en_core_web_trf`` first and ``en_core_web_sm`` second. If neither
+    is installed, ``en_core_web_sm`` is downloaded and then loaded.
+
+    Returns:
+        The loaded spaCy pipeline object.
+
+    Raises:
+        subprocess.CalledProcessError: If the model download command fails.
+        OSError: If the model still cannot be loaded after the download.
+    """
+    # Local imports keep this block independent of the file-level import list.
+    import importlib
+    import subprocess
+    import sys
+    for model_name in ("en_core_web_trf", "en_core_web_sm"):
+        try:
+            return spacy.load(model_name)
+        except OSError:
+            # Model not loadable: fall through to the next candidate.
+            continue
+    # Run the download with the interpreter that is executing this app, not
+    # whichever "python" happens to be first on PATH, and fail loudly.
+    subprocess.run(
+        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
+        check=True,
+    )
+    # Make the freshly installed package visible to this running process.
+    importlib.invalidate_caches()
+    return spacy.load("en_core_web_sm")
+# English pipeline is loaded once at import time and shared by all helpers.
+nlp_en = safe_load_spacy()
+# Retained as a module-level name; it is no longer used to guard downloads.
+stanza_dir = os.path.expanduser("~/.stanza_resources")
+# Each language is downloaded exactly once. Previously a directory-existence
+# guard was followed by the same downloads run unconditionally, so the guard
+# had no effect and a fresh machine downloaded every model twice.
+for _stanza_lang in ('hi', 'ta'):
+    stanza.download(_stanza_lang)
+_use_gpu = torch.cuda.is_available()
+nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=_use_gpu)
+nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=_use_gpu)
+# ===============================
+# Language-Aware Pipeline Loader
+# ===============================
+def load_pipelines(language_code):
+    """Create the emotion and sentiment classifiers for a language.
+
+    Args:
+        language_code: Language code string; compared case-insensitively.
+
+    Returns:
+        Tuple ``(emotion_pipeline, sentiment_pipeline)`` of text-classification
+        pipelines created with ``return_all_scores=True``.
+    """
+    lang = language_code.upper()
+    device = 0 if torch.cuda.is_available() else -1
+    st.write(f"🌍 Language detected: {lang}")
+    st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
+    def _build_classifier(model_name):
+        # Model and tokenizer always come from the same checkpoint.
+        return pipeline(
+            "text-classification",
+            model=model_name,
+            tokenizer=model_name,
+            return_all_scores=True,
+            device=device
+        )
+    # Hindi/Tamil use the BERT checkpoint; English and everything else share
+    # the RoBERTa one.
+    if lang in ["HI", "TA"]:
+        emo_model = "bhadresh-savani/bert-base-go-emotion"
+    else:
+        emo_model = "SamLowe/roberta-base-go_emotions"
+    emotion_pipeline = _build_classifier(emo_model)
+    # Only English gets the SST-2 model; all other languages use the
+    # multilingual checkpoint.
+    if lang == "EN":
+        sent_model = "distilbert-base-uncased-finetuned-sst-2-english"
+    else:
+        sent_model = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
+    sentiment_pipeline = _build_classifier(sent_model)
+    return emotion_pipeline, sentiment_pipeline
+# ===============================
+# DOCX Reader – keep paras separate
+# ===============================
+def read_and_split_articles(file_path):
+    """Return the non-empty paragraphs of a DOCX document, in order.
+
+    Args:
+        file_path: Value handed to ``docx.Document`` (the app passes the
+            uploaded file object).
+
+    Returns:
+        List of paragraph strings with surrounding whitespace removed.
+    """
+    document = docx.Document(file_path)
+    stripped_texts = (para.text.strip() for para in document.paragraphs)
+    return [text for text in stripped_texts if text]
+# ===============================
+# URL Reader – title + main body
+# ===============================
+def read_article_from_url(url):
+    """Download an article and return its title and body as one text.
+
+    Args:
+        url: Article URL; surrounding whitespace is ignored.
+
+    Returns:
+        The stripped title and body joined by a blank line. When one of the
+        two is empty, only the other is returned, so the text never starts
+        or ends with stray blank lines.
+
+    Raises:
+        Whatever ``Article.download`` / ``Article.parse`` raise on failure.
+    """
+    # The caller validates with .strip() but passes the raw input through.
+    article = Article(url.strip())
+    article.download()
+    article.parse()
+    title = (article.title or "").strip()
+    body = (article.text or "").strip()
+    return "\n\n".join(part for part in (title, body) if part)
+# ===============================
+# Filter Neutral
+# ===============================
+def filter_neutral(emotion_results, neutral_threshold=0.75):
+    """Map labels to rounded scores, dropping a dominant ``neutral`` label.
+
+    Args:
+        emotion_results: Iterable of dicts with ``"label"`` and ``"score"``.
+        neutral_threshold: ``neutral`` is removed only when its rounded score
+            is strictly greater than this value.
+
+    Returns:
+        Dict of label -> score rounded to 3 decimals, ordered from highest to
+        lowest score.
+    """
+    ranked = sorted(emotion_results, key=lambda item: item["score"], reverse=True)
+    scores = {}
+    for item in ranked:
+        scores[item["label"]] = round(item["score"], 3)
+    neutral_score = scores.get("neutral")
+    if neutral_score is not None and neutral_score > neutral_threshold:
+        del scores["neutral"]
+    return scores
+# ===============================
+# Sentence Splitter
+# ===============================
+def split_sentences(text, lang):
+    """Split text into non-empty, stripped sentences.
+
+    Args:
+        text: Text to split.
+        lang: ``"hi"`` splits on the danda, ``"ta"`` splits on full stops,
+            any other value uses the English spaCy pipeline's sentences.
+
+    Returns:
+        List of sentence strings; for ``"hi"``/``"ta"`` the delimiter itself
+        is not kept.
+    """
+    delimiter_patterns = {"hi": r'।', "ta": r'\.'}
+    pattern = delimiter_patterns.get(lang)
+    if pattern is None:
+        pieces = [sent.text.strip() for sent in nlp_en(text).sents]
+    else:
+        pieces = re.split(pattern, text)
+    cleaned = (piece.strip() for piece in pieces)
+    return [piece for piece in cleaned if piece]
+# ===============================
+# POS Tagger
+# ===============================
+def get_pos_tags(sentence, lang):
+    """Return ``(token, POS tag)`` pairs for a sentence.
+
+    Args:
+        sentence: Sentence text.
+        lang: ``"en"`` uses spaCy, ``"hi"``/``"ta"`` use the matching Stanza
+            pipeline; any other value yields an empty list.
+
+    Returns:
+        List of ``(text, tag)`` tuples.
+    """
+    if lang == "en":
+        return [(token.text, token.pos_) for token in nlp_en(sentence)]
+    if lang == "hi":
+        stanza_nlp = nlp_hi
+    elif lang == "ta":
+        stanza_nlp = nlp_ta
+    else:
+        return []
+    tagged = []
+    for sent in stanza_nlp(sentence).sentences:
+        tagged.extend((word.text, word.upos) for word in sent.words)
+    return tagged
+# ===============================
+# Analysis Function
+# ===============================
+def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
+    """Analyse an article at overall, paragraph and sentence level.
+
+    Every sentence is classified once (first 512 characters only) and that
+    result feeds all three views, which are rendered with Streamlit in the
+    order overall -> paragraphs -> sentences.
+
+    Args:
+        article_text: Full text. Paragraphs are separated by blank lines,
+            falling back to single newlines when that yields at most one.
+        lang: Language code; only its first two characters are used.
+        emotion_pipeline: Callable returning ``[[{"label", "score"}, ...]]``
+            for a single input string.
+        sentiment_pipeline: Callable with the same output shape.
+
+    Returns:
+        ``(results_summary, export_rows, emotion_to_sentences)``:
+        one dict per sentence; the overall/paragraph/sentence rows for
+        export; and a mapping from an emotion to the labelled sentences in
+        which it ranked among that sentence's five strongest emotions.
+        Emotions that never reach a top five are not present in the mapping.
+    """
+    results_summary = []
+    export_rows = []
+    emotion_to_sentences = {}
+    lang_code = lang[:2]
+    paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
+    if len(paragraphs) <= 1:
+        paragraphs = [p.strip() for p in article_text.split("\n") if p.strip()]
+    # Single inference pass. The three views below used to re-run the models
+    # on the same sentences (emotion 3x, sentiment 2x).
+    # analysed: [(paragraph, [(sentence, emotions, top_emotions, sentiment)])]
+    analysed = []
+    for para in paragraphs:
+        rows = []
+        for sentence in split_sentences(para, lang_code):
+            snippet = sentence[:512]
+            emotions = filter_neutral(emotion_pipeline(snippet)[0], neutral_threshold=0.75)
+            top_emotions = sorted(emotions, key=emotions.get, reverse=True)[:5]
+            best_sentiment = max(sentiment_pipeline(snippet)[0], key=lambda x: x["score"])
+            rows.append((sentence, emotions, top_emotions, best_sentiment))
+        analysed.append((para, rows))
+    # Weighted overall results: each sentence weighs in by its word count.
+    weighted_scores = {}
+    total_length = 0
+    all_sentiments = []
+    for _, rows in analysed:
+        for sentence, emotions, _, best_sentiment in rows:
+            length = len(sentence.split())
+            total_length += length
+            for emo, score in emotions.items():
+                weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
+            all_sentiments.append(best_sentiment)
+    if total_length > 0:
+        weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
+    # NOTE(review): this reports the single most confident sentence-level
+    # sentiment, not an aggregate over the article; kept as-is.
+    overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
+    st.subheader("📊 OVERALL (Weighted)")
+    st.write("Emotions →", weighted_scores)
+    st.write("Sentiment →", overall_sentiment)
+    export_rows.append({
+        "Type": "Overall",
+        "Text": "Weighted across article",
+        "Emotions": weighted_scores,
+        "Sentiment": overall_sentiment
+    })
+    # Paragraph-level: sum of sentence scores per emotion, strongest first.
+    for p_idx, (para, rows) in enumerate(analysed, start=1):
+        para_counter = Counter()
+        for sentence, emotions, top_emotions, _ in rows:
+            for emo, score in emotions.items():
+                para_counter[emo] += score
+            for emo in top_emotions:
+                emotion_to_sentences.setdefault(emo, []).append(f"(Para {p_idx}) {sentence}")
+        para_emotions = dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))
+        st.write(f"\n📑 Paragraph {p_idx}: {para}")
+        st.write("Emotions →", para_emotions)
+        export_rows.append({
+            "Type": "Paragraph",
+            "Text": para,
+            "Emotions": para_emotions,
+            "Sentiment": ""
+        })
+    # Sentence-level
+    st.subheader("📝 SENTENCES")
+    for _, rows in analysed:
+        for sentence, emotions, top_emotions, best_sentiment in rows:
+            pos_tags = get_pos_tags(sentence, lang_code)
+            results_summary.append({
+                "sentence": sentence,
+                "pos_tags": pos_tags,
+                "emotions": emotions,
+                "sentiment": best_sentiment
+            })
+            st.write(f"Sentence: {sentence}")
+            st.write(f"POS Tags → {pos_tags}")
+            st.write(f"Emotions → {emotions}")
+            st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'],4)})\n")
+            for emo in top_emotions:
+                emotion_to_sentences.setdefault(emo, []).append(f"(Sentence) {sentence}")
+            export_rows.append({
+                "Type": "Sentence",
+                "Text": sentence,
+                "Emotions": emotions,
+                "Sentiment": best_sentiment
+            })
+    return results_summary, export_rows, emotion_to_sentences
+# ===============================
+# Streamlit App
+# ===============================
+st.title("📑 Multilingual Text Emotion + Sentiment Analyzer")
+uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
+url_input = st.text_input("Or enter an Article URL")
+text_input = st.text_area("Or paste text here")
+if st.button("🔍 Analyze"):
+    # langdetect is already a dependency of this file; only the exception
+    # type is additionally needed here.
+    from langdetect import LangDetectException
+    with st.spinner("Running analysis... ⏳"):
+        # Input priority: uploaded DOCX, then URL, then pasted text.
+        if uploaded_file:
+            articles = read_and_split_articles(uploaded_file)
+            text_to_analyze = "\n\n".join(articles)
+        elif url_input.strip():
+            try:
+                text_to_analyze = read_article_from_url(url_input.strip())
+            except Exception as exc:
+                # UI boundary: show fetch/parse failures instead of a traceback.
+                st.error(f"Could not read the article from that URL: {exc}")
+                st.stop()
+        elif text_input.strip():
+            text_to_analyze = text_input
+        else:
+            st.warning("Please upload a DOCX, enter a URL, or paste text to analyze.")
+            st.stop()
+        # Detection runs on the first 200 characters; fall back to English
+        # when langdetect cannot decide (e.g. digits or punctuation only).
+        try:
+            detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
+        except LangDetectException:
+            detected_lang = "en"
+        emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
+        results, export_rows, emotion_to_sentences = analyze_article(
+            text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline
+        )
+        # ✅ Download buttons FIRST
+        df_export = pd.DataFrame(export_rows)
+        csv = df_export.to_csv(index=False).encode("utf-8")
+        st.download_button(
+            label="⬇️ Download CSV",
+            data=csv,
+            file_name="analysis_results.csv",
+            mime="text/csv",
+        )
+        excel_buffer = io.BytesIO()
+        df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
+        st.download_button(
+            label="⬇️ Download Excel",
+            data=excel_buffer,
+            file_name="analysis_results.xlsx",
+            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        )
+        # ✅ Emotion filter tabs at the end; emotions without any sentence
+        # are skipped so no empty tab is rendered.
+        emotion_list = [emo for emo, texts in emotion_to_sentences.items() if texts]
+        if emotion_list:
+            st.subheader("🎭 Explore by Emotion (Top 5 only)")
+            tabs = st.tabs(emotion_list)
+            for tab, emo in zip(tabs, emotion_list):
+                with tab:
+                    st.write(f"### 🔹 {emo.upper()}")
+                    for text in emotion_to_sentences[emo]:
+                        st.write(f"- {text}")
+        else:
+            st.info("No emotions strong enough to show in Top 5 filters.")

app copy.py ADDED Viewed

	@@ -0,0 +1,295 @@

+import os
+import spacy
+import stanza
+# ===============================
+# 🔧 Safe SpaCy + Stanza Downloads
+# ===============================
+def safe_load_spacy():
+    """Load an English spaCy pipeline, preferring the transformer model.
+
+    Tries ``en_core_web_trf`` first and ``en_core_web_sm`` second. If neither
+    is installed, ``en_core_web_sm`` is downloaded and then loaded.
+
+    Returns:
+        The loaded spaCy pipeline object.
+
+    Raises:
+        subprocess.CalledProcessError: If the model download command fails.
+        OSError: If the model still cannot be loaded after the download.
+    """
+    # Local imports keep this block independent of the file-level import list.
+    import importlib
+    import subprocess
+    import sys
+    for model_name in ("en_core_web_trf", "en_core_web_sm"):
+        try:
+            return spacy.load(model_name)
+        except OSError:
+            # Model not loadable: fall through to the next candidate.
+            continue
+    # Run the download with the interpreter that is executing this app, not
+    # whichever "python" happens to be first on PATH, and fail loudly.
+    subprocess.run(
+        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
+        check=True,
+    )
+    # Make the freshly installed package visible to this running process.
+    importlib.invalidate_caches()
+    return spacy.load("en_core_web_sm")
+# ✅ Initialize English SpaCy safely (loaded once at import time)
+nlp_en = safe_load_spacy()
+# Ensure Stanza models exist: download Hindi and Tamil only when the Stanza
+# resources directory is missing.
+# NOTE(review): the same two downloads run again unconditionally further down
+# in this file, so this guard looks redundant — confirm before removing.
+stanza_dir = os.path.expanduser("~/.stanza_resources")
+if not os.path.exists(stanza_dir):
+    stanza.download('hi')
+    stanza.download('ta')
+# ===============================
+# 1️⃣ Imports
+# ===============================
+import pandas as pd
+import re
+import docx
+from collections import Counter
+import stanza
+from transformers import pipeline
+import torch
+from langdetect import detect
+import streamlit as st
+import io
+# ===============================
+# 2️⃣ Pre-download Stanza models
+# ===============================
+for _stanza_lang in ('hi', 'ta'):
+    stanza.download(_stanza_lang)
+# ===============================
+# 3️⃣ Initialize Stanza for Hindi/Tamil
+# ===============================
+# Both pipelines run tokenization + POS tagging and share one GPU setting.
+_use_gpu = torch.cuda.is_available()
+nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=_use_gpu)
+nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=_use_gpu)
+# ===============================
+# 4️⃣ Language-Aware Pipeline Loader
+# ===============================
+def load_pipelines(language_code):
+    """Create the emotion and sentiment classifiers for a language.
+
+    Args:
+        language_code: Language code string; compared case-insensitively.
+
+    Returns:
+        Tuple ``(emotion_pipeline, sentiment_pipeline)`` of text-classification
+        pipelines created with ``return_all_scores=True``.
+    """
+    lang = language_code.upper()
+    device = 0 if torch.cuda.is_available() else -1
+    st.write(f"🌍 Language detected: {lang}")
+    st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
+    def _build_classifier(model_name):
+        # Model and tokenizer always come from the same checkpoint.
+        return pipeline(
+            "text-classification",
+            model=model_name,
+            tokenizer=model_name,
+            return_all_scores=True,
+            device=device
+        )
+    # Emotion model: Hindi/Tamil use the BERT checkpoint; English and
+    # everything else share the RoBERTa one.
+    if lang in ["HI", "TA"]:
+        emo_model = "bhadresh-savani/bert-base-go-emotion"
+    else:
+        emo_model = "SamLowe/roberta-base-go_emotions"
+    emotion_pipeline = _build_classifier(emo_model)
+    # Sentiment model: only English gets the SST-2 model; all other
+    # languages use the multilingual checkpoint.
+    if lang == "EN":
+        sent_model = "distilbert-base-uncased-finetuned-sst-2-english"
+    else:
+        sent_model = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
+    sentiment_pipeline = _build_classifier(sent_model)
+    return emotion_pipeline, sentiment_pipeline
+# ===============================
+# 5️⃣ Read DOCX and split articles
+# ===============================
+def read_and_split_articles(file_path):
+    """Return the non-empty paragraphs of a DOCX document, in order.
+
+    Args:
+        file_path: Value handed to ``docx.Document`` (the app passes the
+            uploaded file object).
+
+    Returns:
+        List of paragraph strings with surrounding whitespace removed; each
+        DOCX paragraph stays a separate entry.
+    """
+    document = docx.Document(file_path)
+    stripped_texts = (para.text.strip() for para in document.paragraphs)
+    return [text for text in stripped_texts if text]
+# ===============================
+# 6️⃣ Utility – Filter Neutral
+# ===============================
+def filter_neutral(emotion_results, neutral_threshold=0.75):
+    """Map labels to rounded scores, dropping a dominant ``neutral`` label.
+
+    Args:
+        emotion_results: Iterable of dicts with ``"label"`` and ``"score"``.
+        neutral_threshold: ``neutral`` is removed only when its rounded score
+            is strictly greater than this value.
+
+    Returns:
+        Dict of label -> score rounded to 3 decimals, ordered from highest to
+        lowest score.
+    """
+    ranked = sorted(emotion_results, key=lambda item: item["score"], reverse=True)
+    scores = {}
+    for item in ranked:
+        scores[item["label"]] = round(item["score"], 3)
+    neutral_score = scores.get("neutral")
+    if neutral_score is not None and neutral_score > neutral_threshold:
+        del scores["neutral"]
+    return scores
+# ===============================
+# 7️⃣ Sentence Splitter
+# ===============================
+def split_sentences(text, lang):
+    """Split text into non-empty, stripped sentences.
+
+    Args:
+        text: Text to split.
+        lang: ``"hi"`` splits on the danda, ``"ta"`` splits on full stops,
+            any other value uses the English spaCy pipeline's sentences.
+
+    Returns:
+        List of sentence strings; for ``"hi"``/``"ta"`` the delimiter itself
+        is not kept.
+    """
+    delimiter_patterns = {"hi": r'।', "ta": r'\.'}
+    pattern = delimiter_patterns.get(lang)
+    if pattern is None:
+        pieces = [sent.text.strip() for sent in nlp_en(text).sents]
+    else:
+        pieces = re.split(pattern, text)
+    cleaned = (piece.strip() for piece in pieces)
+    return [piece for piece in cleaned if piece]
+# ===============================
+# 8️⃣ PoS Tagger
+# ===============================
+def get_pos_tags(sentence, lang):
+    """Return ``(token, POS tag)`` pairs for a sentence.
+
+    Args:
+        sentence: Sentence text.
+        lang: ``"en"`` uses spaCy, ``"hi"``/``"ta"`` use the matching Stanza
+            pipeline; any other value yields an empty list.
+
+    Returns:
+        List of ``(text, tag)`` tuples.
+    """
+    if lang == "en":
+        return [(token.text, token.pos_) for token in nlp_en(sentence)]
+    if lang == "hi":
+        stanza_nlp = nlp_hi
+    elif lang == "ta":
+        stanza_nlp = nlp_ta
+    else:
+        return []
+    tagged = []
+    for sent in stanza_nlp(sentence).sentences:
+        tagged.extend((word.text, word.upos) for word in sent.words)
+    return tagged
+# ===============================
+# 9️⃣ Analysis Function
+# ===============================
+def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs):
+    """Analyse an article at overall, paragraph and sentence level.
+
+    Every sentence is classified once (first 512 characters only) and that
+    result feeds all three views, which are rendered with Streamlit in the
+    order overall -> paragraphs -> sentences.
+
+    Args:
+        article_text: Full text; paragraphs are separated by blank lines.
+        lang: Language code; only its first two characters are used.
+        emotion_pipeline: Callable returning ``[[{"label", "score"}, ...]]``
+            for a single input string.
+        sentiment_pipeline: Callable with the same output shape.
+        normalize_paragraphs: When truthy, each paragraph's summed emotion
+            scores are divided by their total and rounded to 3 decimals.
+
+    Returns:
+        ``(results_summary, export_rows)``: one dict per sentence, and the
+        overall/paragraph/sentence rows for export.
+    """
+    results_summary = []
+    export_rows = []
+    lang_code = lang[:2]
+    paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
+    # -------------------------------
+    # Single inference pass. The three views below used to re-run the models
+    # on the same sentences (emotion 3x, sentiment 2x).
+    # analysed: [(paragraph, [(sentence, emotions, best_sentiment), ...]), ...]
+    analysed = []
+    for para in paragraphs:
+        rows = []
+        for sentence in split_sentences(para, lang_code):
+            snippet = sentence[:512]
+            emotions = filter_neutral(emotion_pipeline(snippet)[0], neutral_threshold=0.75)
+            best_sentiment = max(sentiment_pipeline(snippet)[0], key=lambda x: x["score"])
+            rows.append((sentence, emotions, best_sentiment))
+        analysed.append((para, rows))
+    # -------------------------------
+    # ✅ Weighted Overall results: each sentence weighs in by its word count.
+    weighted_scores = {}
+    total_length = 0
+    all_sentiments = []
+    for _, rows in analysed:
+        for sentence, emotions, best_sentiment in rows:
+            length = len(sentence.split())
+            total_length += length
+            for emo, score in emotions.items():
+                weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
+            all_sentiments.append(best_sentiment)
+    if total_length > 0:
+        weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
+    # NOTE(review): this reports the single most confident sentence-level
+    # sentiment, not an aggregate over the article; kept as-is.
+    overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
+    st.subheader("📊 OVERALL (Weighted)")
+    st.write("Emotions →", weighted_scores)
+    st.write("Sentiment →", overall_sentiment)
+    export_rows.append({
+        "Type": "Overall",
+        "Text": "Weighted across article",
+        "Emotions": weighted_scores,
+        "Sentiment": overall_sentiment
+    })
+    # -------------------------------
+    # Paragraph-level: sum of sentence scores per emotion, strongest first.
+    for p_idx, (para, rows) in enumerate(analysed, start=1):
+        para_counter = Counter()
+        for _, emotions, _ in rows:
+            for emo, score in emotions.items():
+                para_counter[emo] += score
+        if normalize_paragraphs:
+            # ✅ Normalize scores so they sum ≤ 1
+            total = sum(para_counter.values())
+            if total > 0:
+                para_counter = {emo: round(val / total, 3) for emo, val in para_counter.items()}
+        para_emotions = dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))
+        st.write(f"\n📑 Paragraph {p_idx}: {para}")
+        st.write("Emotions →", para_emotions)
+        export_rows.append({
+            "Type": "Paragraph",
+            "Text": para,
+            "Emotions": para_emotions,
+            "Sentiment": ""
+        })
+    # -------------------------------
+    # Sentence-level
+    st.subheader("📝 SENTENCES")
+    for _, rows in analysed:
+        for sentence, emotions, best_sentiment in rows:
+            pos_tags = get_pos_tags(sentence, lang_code)
+            results_summary.append({
+                "sentence": sentence,
+                "pos_tags": pos_tags,
+                "emotions": emotions,
+                "sentiment": best_sentiment
+            })
+            st.write(f"Sentence: {sentence}")
+            st.write(f"POS Tags → {pos_tags}")
+            st.write(f"Emotions → {emotions}")
+            st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'],4)})\n")
+            export_rows.append({
+                "Type": "Sentence",
+                "Text": sentence,
+                "Emotions": emotions,
+                "Sentiment": best_sentiment
+            })
+    return results_summary, export_rows
+# ===============================
+# 🔟 Streamlit App
+# ===============================
+st.title("📑 Multilingual Text Emotion + Sentiment Analyzer")
+uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
+text_input = st.text_area("Or paste text here")
+# ✅ Checkbox for paragraph normalization
+normalize_paragraphs = st.checkbox("Normalize paragraph emotion scores", value=True)
+# ✅ Placeholder for download buttons at the top
+download_placeholder = st.empty()
+if st.button("🔍 Analyze"):
+    # langdetect is already a dependency of this file; only the exception
+    # type is additionally needed here.
+    from langdetect import LangDetectException
+    with st.spinner("Running analysis... ⏳"):
+        # Input priority: uploaded DOCX first, then pasted text.
+        if uploaded_file:
+            articles = read_and_split_articles(uploaded_file)
+            text_to_analyze = "\n\n".join(articles) if articles else ""
+        elif text_input.strip():
+            text_to_analyze = text_input
+        else:
+            st.warning("Please upload a DOCX file or paste text to analyze.")
+            st.stop()
+        # Detection runs on the first 200 characters; fall back to English
+        # when langdetect cannot decide (e.g. digits or punctuation only).
+        try:
+            detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
+        except LangDetectException:
+            detected_lang = "en"
+        emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
+        results, export_rows = analyze_article(
+            text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs
+        )
+        # ✅ Show download buttons at the TOP (inside the placeholder created
+        # before the button).
+        df_export = pd.DataFrame(export_rows)
+        csv = df_export.to_csv(index=False).encode("utf-8")
+        with download_placeholder.container():
+            st.download_button(
+                label="⬇️ Download CSV",
+                data=csv,
+                file_name="analysis_results.csv",
+                mime="text/csv",
+            )
+            excel_buffer = io.BytesIO()
+            df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
+            st.download_button(
+                label="⬇️ Download Excel",
+                data=excel_buffer,
+                file_name="analysis_results.xlsx",
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            )

requirements.txt CHANGED Viewed

@@ -9,7 +9,10 @@ langdetect
 openpyxl
 xlsxwriter
 lxml[html_clean]
-newspaper3k
 # ✅ SpaCy and models

 openpyxl
 xlsxwriter
 lxml[html_clean]
+newspaper3k==0.2.8
 # ✅ SpaCy and models