Kartikay Khosla committed on
Commit
14d6a4f
·
1 Parent(s): 149d94e

Update app.py and requirements.txt with URL support and emotion filter

Files changed (5)
  1. .DS_Store +0 -0
  2. SAURL +313 -0
  3. SAURL.py +313 -0
  4. app copy.py +295 -0
  5. requirements.txt +4 -1
.DS_Store ADDED
Binary file (6.15 kB)
 
SAURL ADDED
@@ -0,0 +1,313 @@
+ import os
+ import spacy
+ import stanza
+ import pandas as pd
+ import re
+ import docx
+ from collections import Counter
+ from transformers import pipeline
+ import torch
+ from langdetect import detect
+ import streamlit as st
+ import io
+ from newspaper import Article  # ✅ for URL input
+
+ # ===============================
+ # 🔧 Safe SpaCy + Stanza Downloads
+ # ===============================
+ def safe_load_spacy():
+     # Prefer the transformer model; fall back to the small model,
+     # downloading it if neither is installed.
+     try:
+         return spacy.load("en_core_web_trf")
+     except OSError:
+         try:
+             return spacy.load("en_core_web_sm")
+         except OSError:
+             os.system("python -m spacy download en_core_web_sm")
+             return spacy.load("en_core_web_sm")
+
+ nlp_en = safe_load_spacy()
+
+ # Fetch Stanza models only when the resources directory is missing.
+ stanza_dir = os.path.expanduser("~/.stanza_resources")
+ if not os.path.exists(stanza_dir):
+     stanza.download('hi')
+     stanza.download('ta')
+
+ nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
+ nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
+
+
+ # ===============================
+ # Language-Aware Pipeline Loader
+ # ===============================
+ def load_pipelines(language_code):
+     lang = language_code.upper()
+     device = 0 if torch.cuda.is_available() else -1
+     st.write(f"🌍 Language detected: {lang}")
+     st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
+
+     # Emotion model: GoEmotions fine-tunes; English gets the RoBERTa variant.
+     if lang == "EN":
+         emo_model = "SamLowe/roberta-base-go_emotions"
+     elif lang in ["HI", "TA"]:
+         emo_model = "bhadresh-savani/bert-base-go-emotion"
+     else:
+         emo_model = "SamLowe/roberta-base-go_emotions"
+
+     emotion_pipeline = pipeline(
+         "text-classification",
+         model=emo_model,
+         tokenizer=emo_model,
+         return_all_scores=True,  # deprecated in newer transformers; top_k=None is the equivalent
+         device=device
+     )
+
+     # Sentiment model: SST-2 DistilBERT for English, multilingual XLM-R otherwise.
+     if lang == "EN":
+         sent_model = "distilbert-base-uncased-finetuned-sst-2-english"
+     else:
+         sent_model = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
+
+     sentiment_pipeline = pipeline(
+         "text-classification",
+         model=sent_model,
+         tokenizer=sent_model,
+         return_all_scores=True,
+         device=device
+     )
+
+     return emotion_pipeline, sentiment_pipeline
+
+
+ # ===============================
+ # DOCX Reader – keep paras separate
+ # ===============================
+ def read_and_split_articles(file_path):
+     doc = docx.Document(file_path)
+     paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
+     return paragraphs
+
+
+ # ===============================
+ # URL Reader – title + main body
+ # ===============================
+ def read_article_from_url(url):
+     article = Article(url)
+     article.download()
+     article.parse()
+     title = article.title.strip()
+     body = article.text.strip()
+     full_text = f"{title}\n\n{body}"
+     return full_text
+
+
+ # ===============================
+ # Filter Neutral
+ # ===============================
+ def filter_neutral(emotion_results, neutral_threshold=0.75):
+     # Drop "neutral" only when it dominates; weak neutral scores are kept.
+     scores = {r["label"]: round(r["score"], 3)
+               for r in sorted(emotion_results, key=lambda x: x["score"], reverse=True)}
+     if "neutral" in scores and scores["neutral"] > neutral_threshold:
+         scores.pop("neutral")
+     return scores
+
+
+ # ===============================
+ # Sentence Splitter
+ # ===============================
+ def split_sentences(text, lang):
+     if lang == "hi":
+         sentences = re.split(r'।', text)
+     elif lang == "ta":
+         sentences = re.split(r'\.', text)
+     else:
+         doc = nlp_en(text)
+         sentences = [sent.text.strip() for sent in doc.sents]
+     return [s.strip() for s in sentences if s.strip()]
+
+
+ # ===============================
+ # POS Tagger
+ # ===============================
+ def get_pos_tags(sentence, lang):
+     if lang == "en":
+         doc = nlp_en(sentence)
+         return [(token.text, token.pos_) for token in doc]
+     elif lang == "hi":
+         doc = nlp_hi(sentence)
+         return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
+     elif lang == "ta":
+         doc = nlp_ta(sentence)
+         return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
+     else:
+         return []
+
+
+ # ===============================
+ # Analysis Function
+ # ===============================
+ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
+     results_summary = []
+     export_rows = []
+     para_counters = []
+     emotion_to_sentences = {}
+
+     paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
+     if len(paragraphs) <= 1:
+         paragraphs = [p.strip() for p in article_text.split("\n") if p.strip()]
+
+     # Weighted overall results
+     weighted_scores = {}
+     total_length = 0
+     all_sentiments = []
+
+     for para in paragraphs:
+         sentences = split_sentences(para, lang[:2])
+         for sentence in sentences:
+             # [:512] truncates characters, not tokens; it only guards against very long inputs.
+             emo_results = emotion_pipeline(sentence[:512])[0]
+             filtered = filter_neutral(emo_results)
+             length = len(sentence.split())
+             total_length += length
+             for emo, score in filtered.items():
+                 weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
+             sentiment_results = sentiment_pipeline(sentence[:512])[0]
+             all_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
+
+     if total_length > 0:
+         weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
+
+     overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
+
+     st.subheader("📊 OVERALL (Weighted)")
+     st.write("Emotions →", weighted_scores)
+     st.write("Sentiment →", overall_sentiment)
+
+     export_rows.append({
+         "Type": "Overall",
+         "Text": "Weighted across article",
+         "Emotions": weighted_scores,
+         "Sentiment": overall_sentiment
+     })
+
+     # Paragraph-level
+     for p_idx, para in enumerate(paragraphs, start=1):
+         para_counter = Counter()
+         sentences = split_sentences(para, lang[:2])
+         for sentence in sentences:
+             results = emotion_pipeline(sentence[:512])[0]
+             filtered = filter_neutral(results, neutral_threshold=0.75)
+             top5 = sorted(filtered, key=filtered.get, reverse=True)[:5]
+             for emo, score in filtered.items():
+                 para_counter[emo] += score
+                 if emo not in emotion_to_sentences:
+                     emotion_to_sentences[emo] = []
+                 if emo in top5:
+                     emotion_to_sentences[emo].append(f"(Para {p_idx}) {sentence}")
+
+         para_counters.append((para, dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))))
+         st.write(f"\n📑 Paragraph {p_idx}: {para}")
+         st.write("Emotions →", para_counters[-1][1])
+
+         export_rows.append({
+             "Type": "Paragraph",
+             "Text": para,
+             "Emotions": para_counters[-1][1],
+             "Sentiment": ""
+         })
+
+     # Sentence-level
+     st.subheader("📝 SENTENCES")
+     for para in paragraphs:
+         sentences = split_sentences(para, lang[:2])
+         for sentence in sentences:
+             pos_tags = get_pos_tags(sentence, lang[:2])
+             results = emotion_pipeline(sentence[:512])[0]
+             filtered = filter_neutral(results, neutral_threshold=0.75)
+             sentiment_results = sentiment_pipeline(sentence[:512])[0]
+             best_sentiment = max(sentiment_results, key=lambda x: x["score"])
+             results_summary.append({
+                 "sentence": sentence,
+                 "pos_tags": pos_tags,
+                 "emotions": filtered,
+                 "sentiment": best_sentiment
+             })
+             st.write(f"Sentence: {sentence}")
+             st.write(f"POS Tags → {pos_tags}")
+             st.write(f"Emotions → {filtered}")
+             st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'], 4)})\n")
+
+             for emo in sorted(filtered, key=filtered.get, reverse=True)[:5]:
+                 if emo not in emotion_to_sentences:
+                     emotion_to_sentences[emo] = []
+                 emotion_to_sentences[emo].append(f"(Sentence) {sentence}")
+
+             export_rows.append({
+                 "Type": "Sentence",
+                 "Text": sentence,
+                 "Emotions": filtered,
+                 "Sentiment": best_sentiment
+             })
+
+     return results_summary, export_rows, emotion_to_sentences
+
+
+ # ===============================
+ # Streamlit App
+ # ===============================
+ st.title("📑 Multilingual Text Emotion + Sentiment Analyzer")
+
+ uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
+ url_input = st.text_input("Or enter an Article URL")
+ text_input = st.text_area("Or paste text here")
+
+ if st.button("🔍 Analyze"):
+     with st.spinner("Running analysis... ⏳"):
+         if uploaded_file:
+             articles = read_and_split_articles(uploaded_file)
+             text_to_analyze = "\n\n".join(articles)
+         elif url_input.strip():
+             text_to_analyze = read_article_from_url(url_input)
+         elif text_input.strip():
+             text_to_analyze = text_input
+         else:
+             st.warning("Please upload a DOCX, enter a URL, or paste text to analyze.")
+             st.stop()
+
+         detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
+         emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
+         results, export_rows, emotion_to_sentences = analyze_article(
+             text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline
+         )
+
+         # ✅ Download buttons FIRST
+         df_export = pd.DataFrame(export_rows)
+         csv = df_export.to_csv(index=False).encode("utf-8")
+
+         st.download_button(
+             label="⬇️ Download CSV",
+             data=csv,
+             file_name="analysis_results.csv",
+             mime="text/csv",
+         )
+
+         excel_buffer = io.BytesIO()
+         df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
+         excel_buffer.seek(0)  # rewind so the download serves the whole workbook
+         st.download_button(
+             label="⬇️ Download Excel",
+             data=excel_buffer,
+             file_name="analysis_results.xlsx",
+             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+         )
+
+         # ✅ Emotion filter tabs at the end
+         if emotion_to_sentences:
+             st.subheader("🎭 Explore by Emotion (Top 5 only)")
+             emotion_list = list(emotion_to_sentences.keys())
+             tabs = st.tabs(emotion_list)
+             for idx, emo in enumerate(emotion_list):
+                 with tabs[idx]:
+                     st.write(f"### 🔹 {emo.upper()}")
+                     for text in emotion_to_sentences[emo]:
+                         st.write(f"- {text}")
+         else:
+             st.info("No emotions strong enough to show in Top 5 filters.")
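
One gap in the new URL path above: `Article.download()` raises when a page is unreachable or cannot be parsed, and the app would surface that as a raw traceback. A minimal hardened sketch, assuming newspaper3k's `ArticleException` (the name `read_article_from_url_safe` is hypothetical, not part of this commit):

import streamlit as st
from newspaper import Article
from newspaper.article import ArticleException

def read_article_from_url_safe(url):
    # Hypothetical wrapper: report fetch/parse failures in the UI instead of crashing.
    try:
        article = Article(url)
        article.download()
        article.parse()
    except ArticleException as exc:
        st.error(f"Could not fetch article: {exc}")
        st.stop()
    return f"{article.title.strip()}\n\n{article.text.strip()}"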
SAURL.py ADDED
@@ -0,0 +1,313 @@
+ import os
+ import spacy
+ import stanza
+ import pandas as pd
+ import re
+ import docx
+ from collections import Counter
+ from transformers import pipeline
+ import torch
+ from langdetect import detect
+ import streamlit as st
+ import io
+ from newspaper import Article  # ✅ for URL input
+
+ # ===============================
+ # 🔧 Safe SpaCy + Stanza Downloads
+ # ===============================
+ def safe_load_spacy():
+     # Prefer the transformer model; fall back to the small model,
+     # downloading it if neither is installed.
+     try:
+         return spacy.load("en_core_web_trf")
+     except OSError:
+         try:
+             return spacy.load("en_core_web_sm")
+         except OSError:
+             os.system("python -m spacy download en_core_web_sm")
+             return spacy.load("en_core_web_sm")
+
+ nlp_en = safe_load_spacy()
+
+ # Fetch Stanza models only when the resources directory is missing.
+ stanza_dir = os.path.expanduser("~/.stanza_resources")
+ if not os.path.exists(stanza_dir):
+     stanza.download('hi')
+     stanza.download('ta')
+
+ nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
+ nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
+
+
+ # ===============================
+ # Language-Aware Pipeline Loader
+ # ===============================
+ def load_pipelines(language_code):
+     lang = language_code.upper()
+     device = 0 if torch.cuda.is_available() else -1
+     st.write(f"🌍 Language detected: {lang}")
+     st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
+
+     # Emotion model: GoEmotions fine-tunes; English gets the RoBERTa variant.
+     if lang == "EN":
+         emo_model = "SamLowe/roberta-base-go_emotions"
+     elif lang in ["HI", "TA"]:
+         emo_model = "bhadresh-savani/bert-base-go-emotion"
+     else:
+         emo_model = "SamLowe/roberta-base-go_emotions"
+
+     emotion_pipeline = pipeline(
+         "text-classification",
+         model=emo_model,
+         tokenizer=emo_model,
+         return_all_scores=True,  # deprecated in newer transformers; top_k=None is the equivalent
+         device=device
+     )
+
+     # Sentiment model: SST-2 DistilBERT for English, multilingual XLM-R otherwise.
+     if lang == "EN":
+         sent_model = "distilbert-base-uncased-finetuned-sst-2-english"
+     else:
+         sent_model = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
+
+     sentiment_pipeline = pipeline(
+         "text-classification",
+         model=sent_model,
+         tokenizer=sent_model,
+         return_all_scores=True,
+         device=device
+     )
+
+     return emotion_pipeline, sentiment_pipeline
+
+
+ # ===============================
+ # DOCX Reader – keep paras separate
+ # ===============================
+ def read_and_split_articles(file_path):
+     doc = docx.Document(file_path)
+     paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
+     return paragraphs
+
+
+ # ===============================
+ # URL Reader – title + main body
+ # ===============================
+ def read_article_from_url(url):
+     article = Article(url)
+     article.download()
+     article.parse()
+     title = article.title.strip()
+     body = article.text.strip()
+     full_text = f"{title}\n\n{body}"
+     return full_text
+
+
+ # ===============================
+ # Filter Neutral
+ # ===============================
+ def filter_neutral(emotion_results, neutral_threshold=0.75):
+     # Drop "neutral" only when it dominates; weak neutral scores are kept.
+     scores = {r["label"]: round(r["score"], 3)
+               for r in sorted(emotion_results, key=lambda x: x["score"], reverse=True)}
+     if "neutral" in scores and scores["neutral"] > neutral_threshold:
+         scores.pop("neutral")
+     return scores
+
+
+ # ===============================
+ # Sentence Splitter
+ # ===============================
+ def split_sentences(text, lang):
+     if lang == "hi":
+         sentences = re.split(r'।', text)
+     elif lang == "ta":
+         sentences = re.split(r'\.', text)
+     else:
+         doc = nlp_en(text)
+         sentences = [sent.text.strip() for sent in doc.sents]
+     return [s.strip() for s in sentences if s.strip()]
+
+
+ # ===============================
+ # POS Tagger
+ # ===============================
+ def get_pos_tags(sentence, lang):
+     if lang == "en":
+         doc = nlp_en(sentence)
+         return [(token.text, token.pos_) for token in doc]
+     elif lang == "hi":
+         doc = nlp_hi(sentence)
+         return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
+     elif lang == "ta":
+         doc = nlp_ta(sentence)
+         return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
+     else:
+         return []
+
+
+ # ===============================
+ # Analysis Function
+ # ===============================
+ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline):
+     results_summary = []
+     export_rows = []
+     para_counters = []
+     emotion_to_sentences = {}
+
+     paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
+     if len(paragraphs) <= 1:
+         paragraphs = [p.strip() for p in article_text.split("\n") if p.strip()]
+
+     # Weighted overall results
+     weighted_scores = {}
+     total_length = 0
+     all_sentiments = []
+
+     for para in paragraphs:
+         sentences = split_sentences(para, lang[:2])
+         for sentence in sentences:
+             # [:512] truncates characters, not tokens; it only guards against very long inputs.
+             emo_results = emotion_pipeline(sentence[:512])[0]
+             filtered = filter_neutral(emo_results)
+             length = len(sentence.split())
+             total_length += length
+             for emo, score in filtered.items():
+                 weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
+             sentiment_results = sentiment_pipeline(sentence[:512])[0]
+             all_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
+
+     if total_length > 0:
+         weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
+
+     overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
+
+     st.subheader("📊 OVERALL (Weighted)")
+     st.write("Emotions →", weighted_scores)
+     st.write("Sentiment →", overall_sentiment)
+
+     export_rows.append({
+         "Type": "Overall",
+         "Text": "Weighted across article",
+         "Emotions": weighted_scores,
+         "Sentiment": overall_sentiment
+     })
+
+     # Paragraph-level
+     for p_idx, para in enumerate(paragraphs, start=1):
+         para_counter = Counter()
+         sentences = split_sentences(para, lang[:2])
+         for sentence in sentences:
+             results = emotion_pipeline(sentence[:512])[0]
+             filtered = filter_neutral(results, neutral_threshold=0.75)
+             top5 = sorted(filtered, key=filtered.get, reverse=True)[:5]
+             for emo, score in filtered.items():
+                 para_counter[emo] += score
+                 if emo not in emotion_to_sentences:
+                     emotion_to_sentences[emo] = []
+                 if emo in top5:
+                     emotion_to_sentences[emo].append(f"(Para {p_idx}) {sentence}")
+
+         para_counters.append((para, dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))))
+         st.write(f"\n📑 Paragraph {p_idx}: {para}")
+         st.write("Emotions →", para_counters[-1][1])
+
+         export_rows.append({
+             "Type": "Paragraph",
+             "Text": para,
+             "Emotions": para_counters[-1][1],
+             "Sentiment": ""
+         })
+
+     # Sentence-level
+     st.subheader("📝 SENTENCES")
+     for para in paragraphs:
+         sentences = split_sentences(para, lang[:2])
+         for sentence in sentences:
+             pos_tags = get_pos_tags(sentence, lang[:2])
+             results = emotion_pipeline(sentence[:512])[0]
+             filtered = filter_neutral(results, neutral_threshold=0.75)
+             sentiment_results = sentiment_pipeline(sentence[:512])[0]
+             best_sentiment = max(sentiment_results, key=lambda x: x["score"])
+             results_summary.append({
+                 "sentence": sentence,
+                 "pos_tags": pos_tags,
+                 "emotions": filtered,
+                 "sentiment": best_sentiment
+             })
+             st.write(f"Sentence: {sentence}")
+             st.write(f"POS Tags → {pos_tags}")
+             st.write(f"Emotions → {filtered}")
+             st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'], 4)})\n")
+
+             for emo in sorted(filtered, key=filtered.get, reverse=True)[:5]:
+                 if emo not in emotion_to_sentences:
+                     emotion_to_sentences[emo] = []
+                 emotion_to_sentences[emo].append(f"(Sentence) {sentence}")
+
+             export_rows.append({
+                 "Type": "Sentence",
+                 "Text": sentence,
+                 "Emotions": filtered,
+                 "Sentiment": best_sentiment
+             })
+
+     return results_summary, export_rows, emotion_to_sentences
+
+
+ # ===============================
+ # Streamlit App
+ # ===============================
+ st.title("📑 Multilingual Text Emotion + Sentiment Analyzer")
+
+ uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
+ url_input = st.text_input("Or enter an Article URL")
+ text_input = st.text_area("Or paste text here")
+
+ if st.button("🔍 Analyze"):
+     with st.spinner("Running analysis... ⏳"):
+         if uploaded_file:
+             articles = read_and_split_articles(uploaded_file)
+             text_to_analyze = "\n\n".join(articles)
+         elif url_input.strip():
+             text_to_analyze = read_article_from_url(url_input)
+         elif text_input.strip():
+             text_to_analyze = text_input
+         else:
+             st.warning("Please upload a DOCX, enter a URL, or paste text to analyze.")
+             st.stop()
+
+         detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
+         emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
+         results, export_rows, emotion_to_sentences = analyze_article(
+             text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline
+         )
+
+         # ✅ Download buttons FIRST
+         df_export = pd.DataFrame(export_rows)
+         csv = df_export.to_csv(index=False).encode("utf-8")
+
+         st.download_button(
+             label="⬇️ Download CSV",
+             data=csv,
+             file_name="analysis_results.csv",
+             mime="text/csv",
+         )
+
+         excel_buffer = io.BytesIO()
+         df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
+         excel_buffer.seek(0)  # rewind so the download serves the whole workbook
+         st.download_button(
+             label="⬇️ Download Excel",
+             data=excel_buffer,
+             file_name="analysis_results.xlsx",
+             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+         )
+
+         # ✅ Emotion filter tabs at the end
+         if emotion_to_sentences:
+             st.subheader("🎭 Explore by Emotion (Top 5 only)")
+             emotion_list = list(emotion_to_sentences.keys())
+             tabs = st.tabs(emotion_list)
+             for idx, emo in enumerate(emotion_list):
+                 with tabs[idx]:
+                     st.write(f"### 🔹 {emo.upper()}")
+                     for text in emotion_to_sentences[emo]:
+                         st.write(f"- {text}")
+         else:
+             st.info("No emotions strong enough to show in Top 5 filters.")
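
SAURL.py repeats the module above verbatim. To make the commit's emotion filter concrete, here is a small worked check of `filter_neutral` (assuming the definition above is in scope): a dominant neutral score is dropped, while a sub-threshold one survives.

emotion_results = [
    {"label": "neutral", "score": 0.91},
    {"label": "joy", "score": 0.06},
    {"label": "anger", "score": 0.03},
]
print(filter_neutral(emotion_results))
# -> {'joy': 0.06, 'anger': 0.03}   (neutral > 0.75, so it is removed)

emotion_results = [
    {"label": "joy", "score": 0.52},
    {"label": "neutral", "score": 0.40},
]
print(filter_neutral(emotion_results))
# -> {'joy': 0.52, 'neutral': 0.4}  (neutral below the threshold, so it stays)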
app copy.py ADDED
@@ -0,0 +1,295 @@
+ import os
+ import spacy
+ import stanza
+
+ # ===============================
+ # 🔧 Safe SpaCy + Stanza Downloads
+ # ===============================
+ def safe_load_spacy():
+     try:
+         return spacy.load("en_core_web_trf")
+     except OSError:
+         try:
+             return spacy.load("en_core_web_sm")
+         except OSError:
+             os.system("python -m spacy download en_core_web_sm")
+             return spacy.load("en_core_web_sm")
+
+ # ✅ Initialize English SpaCy safely
+ nlp_en = safe_load_spacy()
+
+ # Ensure Stanza models exist
+ stanza_dir = os.path.expanduser("~/.stanza_resources")
+ if not os.path.exists(stanza_dir):
+     stanza.download('hi')
+     stanza.download('ta')
+
+ # ===============================
+ # 1️⃣ Imports
+ # ===============================
+ import pandas as pd
+ import re
+ import docx
+ from collections import Counter
+ from transformers import pipeline
+ import torch
+ from langdetect import detect
+ import streamlit as st
+ import io
+
+ # ===============================
+ # 2️⃣ Pre-download Stanza models
+ # ===============================
+ # Handled above by the ~/.stanza_resources existence check.
+
+ # ===============================
+ # 3️⃣ Initialize Stanza for Hindi/Tamil
+ # ===============================
+ nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
+ nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=torch.cuda.is_available())
+
+ # ===============================
+ # 4️⃣ Language-Aware Pipeline Loader
+ # ===============================
+ def load_pipelines(language_code):
+     lang = language_code.upper()
+     device = 0 if torch.cuda.is_available() else -1
+     st.write(f"🌍 Language detected: {lang}")
+     st.write(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")
+
+     # Emotion model
+     if lang == "EN":
+         emo_model = "SamLowe/roberta-base-go_emotions"
+     elif lang in ["HI", "TA"]:
+         emo_model = "bhadresh-savani/bert-base-go-emotion"
+     else:
+         emo_model = "SamLowe/roberta-base-go_emotions"
+
+     emotion_pipeline = pipeline(
+         "text-classification",
+         model=emo_model,
+         tokenizer=emo_model,
+         return_all_scores=True,
+         device=device
+     )
+
+     # Sentiment model
+     if lang == "EN":
+         sent_model = "distilbert-base-uncased-finetuned-sst-2-english"
+     else:
+         sent_model = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
+
+     sentiment_pipeline = pipeline(
+         "text-classification",
+         model=sent_model,
+         tokenizer=sent_model,
+         return_all_scores=True,
+         device=device
+     )
+
+     return emotion_pipeline, sentiment_pipeline
+
+ # ===============================
+ # 5️⃣ Read DOCX and split articles
+ # ===============================
+ def read_and_split_articles(file_path):
+     doc = docx.Document(file_path)
+     paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
+     return paragraphs  # ✅ Each docx paragraph separately
+
+ # ===============================
+ # 6️⃣ Utility – Filter Neutral
+ # ===============================
+ def filter_neutral(emotion_results, neutral_threshold=0.75):
+     scores = {r["label"]: round(r["score"], 3)
+               for r in sorted(emotion_results, key=lambda x: x["score"], reverse=True)}
+     if "neutral" in scores and scores["neutral"] > neutral_threshold:
+         scores.pop("neutral")
+     return scores
+
+ # ===============================
+ # 7️⃣ Sentence Splitter
+ # ===============================
+ def split_sentences(text, lang):
+     if lang == "hi":
+         sentences = re.split(r'।', text)
+     elif lang == "ta":
+         sentences = re.split(r'\.', text)
+     else:
+         doc = nlp_en(text)
+         sentences = [sent.text.strip() for sent in doc.sents]
+     return [s.strip() for s in sentences if s.strip()]
+
+ # ===============================
+ # 8️⃣ PoS Tagger
+ # ===============================
+ def get_pos_tags(sentence, lang):
+     if lang == "en":
+         doc = nlp_en(sentence)
+         return [(token.text, token.pos_) for token in doc]
+     elif lang == "hi":
+         doc = nlp_hi(sentence)
+         return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
+     elif lang == "ta":
+         doc = nlp_ta(sentence)
+         return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
+     else:
+         return []
+
+ # ===============================
+ # 9️⃣ Analysis Function
+ # ===============================
+ def analyze_article(article_text, lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs):
+     results_summary = []
+     export_rows = []
+     para_counters = []
+
+     paragraphs = [p.strip() for p in article_text.split("\n\n") if p.strip()]
+
+     # -------------------------------
+     # ✅ Weighted Overall results
+     weighted_scores = {}
+     total_length = 0
+     all_sentiments = []
+
+     for para in paragraphs:
+         sentences = split_sentences(para, lang[:2])
+         for sentence in sentences:
+             emo_results = emotion_pipeline(sentence[:512])[0]
+             filtered = filter_neutral(emo_results)
+             length = len(sentence.split())
+             total_length += length
+             for emo, score in filtered.items():
+                 weighted_scores[emo] = weighted_scores.get(emo, 0) + score * length
+             sentiment_results = sentiment_pipeline(sentence[:512])[0]
+             all_sentiments.append(max(sentiment_results, key=lambda x: x["score"]))
+
+     if total_length > 0:
+         weighted_scores = {emo: round(val / total_length, 3) for emo, val in weighted_scores.items()}
+
+     overall_sentiment = max(all_sentiments, key=lambda x: x["score"]) if all_sentiments else {}
+
+     st.subheader("📊 OVERALL (Weighted)")
+     st.write("Emotions →", weighted_scores)
+     st.write("Sentiment →", overall_sentiment)
+
+     export_rows.append({
+         "Type": "Overall",
+         "Text": "Weighted across article",
+         "Emotions": weighted_scores,
+         "Sentiment": overall_sentiment
+     })
+
+     # -------------------------------
+     # Paragraph-level
+     for p_idx, para in enumerate(paragraphs, start=1):
+         para_counter = Counter()
+         sentences = split_sentences(para, lang[:2])
+         for sentence in sentences:
+             results = emotion_pipeline(sentence[:512])[0]
+             filtered = filter_neutral(results, neutral_threshold=0.75)
+             for emo, score in filtered.items():
+                 para_counter[emo] += score
+
+         if normalize_paragraphs:
+             # ✅ Normalize scores so they sum ≤ 1
+             total = sum(para_counter.values())
+             if total > 0:
+                 para_counter = {emo: round(val / total, 3) for emo, val in para_counter.items()}
+
+         para_counters.append((para, dict(sorted(para_counter.items(), key=lambda x: x[1], reverse=True))))
+         st.write(f"\n📑 Paragraph {p_idx}: {para}")
+         st.write("Emotions →", para_counters[-1][1])
+
+         export_rows.append({
+             "Type": "Paragraph",
+             "Text": para,
+             "Emotions": para_counters[-1][1],
+             "Sentiment": ""
+         })
+
+     # -------------------------------
+     # Sentence-level
+     st.subheader("📝 SENTENCES")
+     for para in paragraphs:
+         sentences = split_sentences(para, lang[:2])
+         for sentence in sentences:
+             pos_tags = get_pos_tags(sentence, lang[:2])
+             results = emotion_pipeline(sentence[:512])[0]
+             filtered = filter_neutral(results, neutral_threshold=0.75)
+             sentiment_results = sentiment_pipeline(sentence[:512])[0]
+             best_sentiment = max(sentiment_results, key=lambda x: x["score"])
+             results_summary.append({
+                 "sentence": sentence,
+                 "pos_tags": pos_tags,
+                 "emotions": filtered,
+                 "sentiment": best_sentiment
+             })
+             st.write(f"Sentence: {sentence}")
+             st.write(f"POS Tags → {pos_tags}")
+             st.write(f"Emotions → {filtered}")
+             st.write(f"Sentiment → {best_sentiment['label']} ({round(best_sentiment['score'], 4)})\n")
+
+             export_rows.append({
+                 "Type": "Sentence",
+                 "Text": sentence,
+                 "Emotions": filtered,
+                 "Sentiment": best_sentiment
+             })
+
+     return results_summary, export_rows
+
+ # ===============================
+ # 🔟 Streamlit App
+ # ===============================
+ st.title("📑 Multilingual Text Emotion + Sentiment Analyzer")
+
+ uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
+ text_input = st.text_area("Or paste text here")
+
+ # ✅ Checkbox for paragraph normalization
+ normalize_paragraphs = st.checkbox("Normalize paragraph emotion scores", value=True)
+
+ # ✅ Placeholder for download buttons at the top
+ download_placeholder = st.empty()
+
+ if st.button("🔍 Analyze"):
+     with st.spinner("Running analysis... ⏳"):
+         if uploaded_file:
+             articles = read_and_split_articles(uploaded_file)
+             text_to_analyze = "\n\n".join(articles) if articles else ""
+         elif text_input.strip():
+             text_to_analyze = text_input
+         else:
+             st.warning("Please upload a DOCX file or paste text to analyze.")
+             st.stop()
+
+         detected_lang = detect(text_to_analyze[:200]) if text_to_analyze else "en"
+         emotion_pipeline, sentiment_pipeline = load_pipelines(detected_lang)
+         results, export_rows = analyze_article(
+             text_to_analyze, detected_lang, emotion_pipeline, sentiment_pipeline, normalize_paragraphs
+         )
+
+         # ✅ Show download buttons at the TOP
+         df_export = pd.DataFrame(export_rows)
+         csv = df_export.to_csv(index=False).encode("utf-8")
+
+         with download_placeholder.container():
+             st.download_button(
+                 label="⬇️ Download CSV",
+                 data=csv,
+                 file_name="analysis_results.csv",
+                 mime="text/csv",
+             )
+
+             excel_buffer = io.BytesIO()
+             df_export.to_excel(excel_buffer, index=False, engine="xlsxwriter")
+             excel_buffer.seek(0)  # rewind so the download serves the whole workbook
+             st.download_button(
+                 label="⬇️ Download Excel",
+                 data=excel_buffer,
+                 file_name="analysis_results.xlsx",
+                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+             )
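
The `normalize_paragraphs` flag above rescales each paragraph's summed emotion scores into proportions. A tiny worked example of that step in isolation:

from collections import Counter

para_counter = Counter({"joy": 1.2, "sadness": 0.6, "fear": 0.2})
total = sum(para_counter.values())  # 2.0
normalized = {emo: round(val / total, 3) for emo, val in para_counter.items()}
print(normalized)  # {'joy': 0.6, 'sadness': 0.3, 'fear': 0.1}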
requirements.txt CHANGED
@@ -9,7 +9,10 @@ langdetect
  openpyxl
  xlsxwriter
  lxml[html_clean]
- newspaper3k
+ newspaper3k==0.2.8
+
+
+


  # ✅ SpaCy and models
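
Pinning newspaper3k==0.2.8 alongside lxml[html_clean] matters because lxml 5.2 moved its HTML cleaner into a separate package that newspaper3k needs at import time. A quick sanity check of the resolved stack (the `lxml_html_clean` module name is my assumption about what the extra installs):

from importlib.metadata import version

import lxml_html_clean  # provided by the lxml[html_clean] extra
from newspaper import Article  # should import cleanly once the cleaner is present

print(version("newspaper3k"))  # expect 0.2.8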