ysuneu committed on
Commit
fda07e6
·
verified ·
1 Parent(s): 9a29b32

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -271
app.py CHANGED
@@ -3,200 +3,9 @@ import pandas as pd
3
  from transformers import pipeline
4
  import tempfile
5
  import os
6
- from typing import List, Dict
7
- import matplotlib.pyplot as plt
8
-
9
- @st.cache_resource
10
- def load_model():
11
- """Load and cache the sentiment analysis model"""
12
- try:
13
- return pipeline(
14
- "text-classification",
15
- model="KeonBlackwell/movie_sentiment_model",
16
- tokenizer="distilbert-base-uncased"
17
- )
18
- except Exception as e:
19
- st.error(f"模型加载失败: {str(e)}")
20
- return None
21
-
22
- def analyze_comments(comments: List[str], classifier) -> List[Dict]:
23
- """Analyze a list of comments and return sentiment results"""
24
- results = []
25
- for comment in comments:
26
- prediction = classifier(comment)[0]
27
- results.append({
28
- 'comment': comment,
29
- 'sentiment': 1 if prediction['label'] == 'LABEL_1' else 0,
30
- 'confidence': prediction['score']
31
- })
32
- return results
33
-
34
- def calculate_star_rating(positive_percent: float) -> int:
35
- """Convert positive percentage to star rating (1-5)"""
36
- if positive_percent >= 80:
37
- return 5
38
- elif positive_percent >= 60:
39
- return 4
40
- elif positive_percent >= 40:
41
- return 3
42
- elif positive_percent >= 20:
43
- return 2
44
- return 1
45
-
46
- def show_sentiment_distribution(positive_percent: float):
47
- """Display a pie chart of sentiment distribution"""
48
- fig, ax = plt.subplots()
49
- ax.pie([positive_percent, 100-positive_percent],
50
- labels=['Positive', 'Negative'],
51
- autopct='%1.1f%%',
52
- colors=['#4CAF50', '#F44336'])
53
- ax.axis('equal') # Equal aspect ratio ensures pie is drawn as a circle
54
- st.pyplot(fig)
55
-
56
- def main():
57
- st.set_page_config(page_title="电影评论分析系统", page_icon="🎬")
58
-
59
- # Custom CSS
60
- st.markdown("""
61
- <style>
62
- .reportview-container {
63
- background: #f0f2f6;
64
- }
65
- .stProgress > div > div > div > div {
66
- background-color: #4CAF50;
67
- }
68
- </style>
69
- """, unsafe_allow_html=True)
70
-
71
- # Load model
72
- classifier = load_model()
73
- if classifier is None:
74
- return
75
-
76
- # Page layout
77
- st.title("🎬 电影评论批量分析系统")
78
- st.markdown("""
79
- ### 使用说明:
80
- 1. 上传包含电影评论的CSV文件(需包含'comment'列)
81
- 2. 系统自动分析每条评论的情感倾向
82
- 3. 生成整体评分和分析报告
83
- """)
84
-
85
- # Sample file download
86
- with st.expander("下载示例文件"):
87
- sample_data = pd.DataFrame({'comment': [
88
- "This movie was fantastic! The acting was superb.",
89
- "I didn't like the plot. It was too predictable.",
90
- "The cinematography was beautiful but the story was weak."
91
- ]})
92
- st.download_button(
93
- label="下载示例CSV",
94
- data=sample_data.to_csv(index=False).encode('utf-8'),
95
- file_name="sample_reviews.csv",
96
- mime="text/csv"
97
- )
98
-
99
- # File upload
100
- uploaded_file = st.file_uploader("上传CSV文件", type=["csv"])
101
-
102
- if uploaded_file is not None:
103
- try:
104
- df = pd.read_csv(uploaded_file)
105
- if 'comment' not in df.columns:
106
- st.error("CSV文件必须包含'comment'列")
107
- return
108
-
109
- comments = df['comment'].dropna().tolist()
110
-
111
- with st.expander("原始数据预览(前5行)"):
112
- st.dataframe(df.head())
113
-
114
- if st.button("开始分析", type="primary"):
115
- if len(comments) > 1000:
116
- st.warning(f"检测到大量评论 ({len(comments)} 条),分析可能需要较长时间...")
117
-
118
- with st.spinner("分析中,请稍候..."):
119
- results = analyze_comments(comments, classifier)
120
- result_df = pd.DataFrame(results)
121
-
122
- # Calculate statistics
123
- positive_count = result_df['sentiment'].sum()
124
- total_reviews = len(result_df)
125
- positive_percent = (positive_count / total_reviews) * 100
126
- star_rating = calculate_star_rating(positive_percent)
127
-
128
- # Display results
129
- st.success("分析完成!")
130
-
131
- # Metrics
132
- col1, col2, col3 = st.columns(3)
133
- with col1:
134
- st.metric("⭐ 综合评分", f"{star_rating} 星")
135
- with col2:
136
- st.metric("👍 正面评价", f"{positive_count}/{total_reviews}")
137
- with col3:
138
- st.metric("📈 正面比例", f"{positive_percent:.1f}%")
139
-
140
- # Visualizations
141
- show_sentiment_distribution(positive_percent)
142
-
143
- # Sample results
144
- with st.expander("查看详细分析结果(前10条)"):
145
- st.dataframe(result_df.head(10))
146
-
147
- # Download
148
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
149
- result_df.to_csv(tmp.name, index=False)
150
- with open(tmp.name, "rb") as f:
151
- st.download_button(
152
- label="下载完整结果",
153
- data=f,
154
- file_name="analysis_results.csv",
155
- mime="text/csv"
156
- )
157
- os.unlink(tmp.name)
158
-
159
- except Exception as e:
160
- st.error(f"处理文件时出错: {str(e)}")
161
-
162
- if __name__ == "__main__":
163
- main()
164
- Key improvements made:
165
-
166
- Added model caching
167
-
168
- Included a sample file download
169
-
170
- Added a pie chart visualization
171
-
172
- Better organized the code into functions
173
-
174
- Added type hints
175
-
176
- Improved error handling and user feedback
177
-
178
- Added warning for large files
179
-
180
- Would you like me to explain any specific part of these improvements in more detail?
181
-
182
- 我不需要优化,在源代码上添加以下几步,合并CSV里的每行评论,合并成一个文本,利用huggingface上的模型keyphrase_extractor = pipeline(
183
- "token-classification",
184
- model="ml6team/keyphrase-extraction-distilbert-inspec",
185
- aggregation_strategy="simple" # 用于合并子词/标记
186
- 和summarizer = pipeline("summarization",
187
- model="facebook/bart-large-cnn")
188
- 进行提取5个关键词,和生成总结,我是在huggingface上设计app
189
- 我将在你的原始代码基础上添加关键词提取和总结生成功能。以下是修改后的代码,保留了原有功能,同时增加了你要求的新特性:
190
-
191
- python
192
- import streamlit as st
193
- import pandas as pd
194
- from transformers import pipeline
195
- import tempfile
196
- import os
197
 
198
  def calculate_star_rating(positive_percent):
199
- """将正面评价百分比转换为星级评分"""
200
  if positive_percent >= 80:
201
  return 5
202
  elif positive_percent >= 60:
@@ -209,9 +18,9 @@ def calculate_star_rating(positive_percent):
209
  return 1
210
 
211
  def main():
212
- st.set_page_config(page_title="电影评论分析系统", page_icon="🎬")
213
-
214
- # 自定义样式
215
  st.markdown("""
216
  <style>
217
  .reportview-container {
@@ -223,125 +32,125 @@ def main():
223
  </style>
224
  """, unsafe_allow_html=True)
225
 
226
- # 模型加载
227
- with st.spinner("正在加载所有模型,这可能需要几分钟..."):
228
  try:
229
- # 情感分析模型
230
  classifier = pipeline(
231
  "text-classification",
232
  model="KeonBlackwell/movie_sentiment_model",
233
  tokenizer="distilbert-base-uncased"
234
  )
235
-
236
- # 关键词提取模型
237
  keyphrase_extractor = pipeline(
238
  "token-classification",
239
  model="ml6team/keyphrase-extraction-distilbert-inspec",
240
  aggregation_strategy="simple"
241
  )
242
-
243
- # 摘要生成模型
244
  summarizer = pipeline("summarization",
245
  model="facebook/bart-large-cnn")
246
-
247
  except Exception as e:
248
- st.error(f"模型加载失败: {str(e)}")
249
  return
250
 
251
- # 页面布局
252
- st.title("🎬 电影评论批量分析系统")
253
  st.markdown("""
254
- ### 使用说明:
255
- 1. 上传包含电影评论的CSV文件(需包含'comment'列)
256
- 2. 系统自动分析每条评论的情感倾向
257
- 3. 生成整体评分、关键词提取和总结报告
258
  """)
259
 
260
- # 文件上传
261
- uploaded_file = st.file_uploader("上传CSV文件", type=["csv"])
262
-
263
  if uploaded_file is not None:
264
- # 读取数据
265
  try:
266
  df = pd.read_csv(uploaded_file)
267
  if 'comment' not in df.columns:
268
- st.error("CSV文件必须包含'comment'")
269
  return
270
-
271
  comments = df['comment'].tolist()
272
  except Exception as e:
273
- st.error(f"文件读取失败: {str(e)}")
274
  return
275
 
276
- # 显示预览
277
- with st.expander("原始数据预览(前5行)"):
278
  st.dataframe(df.head())
279
 
280
- if st.button("开始分析"):
281
- # 进度条设置
282
  progress_bar = st.progress(0)
283
  status_text = st.empty()
284
-
285
  results = []
286
  total = len(comments)
287
-
288
- # 批量预测
289
  try:
290
- # 情感分析
291
  for i, comment in enumerate(comments):
292
  progress = (i+1)/total
293
  progress_bar.progress(progress)
294
- status_text.text(f"正在分析情感 {i+1}/{total} 条评论...")
295
-
296
  prediction = classifier(comment)[0]
297
  results.append({
298
  'comment': comment,
299
  'sentiment': 1 if prediction['label'] == 'LABEL_1' else 0,
300
  'confidence': prediction['score']
301
  })
302
-
303
- # 转换为DataFrame
304
  result_df = pd.DataFrame(results)
305
-
306
- # 计算统计指标
307
  positive_count = result_df['sentiment'].sum()
308
  total_reviews = len(result_df)
309
  positive_percent = (positive_count / total_reviews) * 100
310
  star_rating = calculate_star_rating(positive_percent)
311
-
312
- # 显示结果
313
- st.success("情感分析完成!")
314
-
315
- # 评分展示
316
  col1, col2, col3 = st.columns(3)
317
  with col1:
318
- st.metric("⭐ 综合评分", f"{star_rating} ")
319
  with col2:
320
- st.metric("👍 正面评价", f"{positive_count}/{total_reviews}")
321
  with col3:
322
- st.metric("📈 正面比例", f"{positive_percent:.1f}%")
323
-
324
- # 进度条可视化
325
  st.progress(positive_percent/100)
326
-
327
- # 显示示例结果
328
- with st.expander("查看详细分析结果(前10条)"):
329
  st.dataframe(result_df.head(10))
330
-
331
- # 关键词提取和总结
332
- st.subheader("📌 评论关键词提取与总结")
333
-
334
- # 合并所有评论为一个文本
335
  combined_text = " ".join(comments)
336
-
337
- # 关键词提取
338
- with st.spinner("正在提取关键词..."):
339
  keyphrases = keyphrase_extractor(combined_text)
340
- # 按置信度排序并取前5
341
  top_keyphrases = sorted(keyphrases, key=lambda x: x['score'], reverse=True)[:5]
342
-
343
- # 显示关键词
344
- st.markdown("**🔍 提取的关键词:**")
345
  cols = st.columns(5)
346
  for i, phrase in enumerate(top_keyphrases):
347
  cols[i].markdown(f"""
@@ -354,40 +163,40 @@ def main():
354
  background-color: #f9f9f9;
355
  ">
356
  <b>{phrase['word']}</b><br>
357
- <small>置信度: {phrase['score']:.2f}</small>
358
  </div>
359
  """, unsafe_allow_html=True)
360
-
361
- # 生成总结
362
- with st.spinner("正在生成评论总结..."):
363
- # 限制文本长度以避免模型限制
364
- max_length = 1024 # 模型的最大输入长度
365
  if len(combined_text) > max_length:
366
  combined_text = combined_text[:max_length]
367
-
368
  summary = summarizer(combined_text,
369
  max_length=130,
370
  min_length=30,
371
  do_sample=False)
372
-
373
- # 显示总结
374
- st.markdown("**📝 评论总结:**")
375
  st.info(summary[0]['summary_text'])
376
-
377
- # 生成可下载文件
378
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
379
  result_df.to_csv(tmp.name, index=False)
380
  with open(tmp.name, "rb") as f:
381
  st.download_button(
382
- label="下载完整结果",
383
  data=f,
384
  file_name="analysis_results.csv",
385
  mime="text/csv"
386
  )
387
  os.unlink(tmp.name)
388
-
389
  except Exception as e:
390
- st.error(f"分析过程中出现错误: {str(e)}")
391
  finally:
392
  progress_bar.empty()
393
  status_text.empty()
 
3
  from transformers import pipeline
4
  import tempfile
5
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def calculate_star_rating(positive_percent):
8
+ """Convert positive percentage to star rating"""
9
  if positive_percent >= 80:
10
  return 5
11
  elif positive_percent >= 60:
 
18
  return 1
19
 
20
  def main():
21
+ st.set_page_config(page_title="Movie Review Analysis System", page_icon="🎬")
22
+
23
+ # Custom styles
24
  st.markdown("""
25
  <style>
26
  .reportview-container {
 
32
  </style>
33
  """, unsafe_allow_html=True)
34
 
35
+ # Model loading
36
+ with st.spinner("Loading all models, this may take a few minutes..."):
37
  try:
38
+ # Sentiment analysis model
39
  classifier = pipeline(
40
  "text-classification",
41
  model="KeonBlackwell/movie_sentiment_model",
42
  tokenizer="distilbert-base-uncased"
43
  )
44
+
45
+ # Keyphrase extraction model
46
  keyphrase_extractor = pipeline(
47
  "token-classification",
48
  model="ml6team/keyphrase-extraction-distilbert-inspec",
49
  aggregation_strategy="simple"
50
  )
51
+
52
+ # Summarization model
53
  summarizer = pipeline("summarization",
54
  model="facebook/bart-large-cnn")
55
+
56
  except Exception as e:
57
+ st.error(f"Model loading failed: {str(e)}")
58
  return
59
 
60
+ # Page layout
61
+ st.title("🎬 Movie Review Batch Analysis System")
62
  st.markdown("""
63
+ ### Instructions:
64
+ 1. Upload a CSV file containing movie reviews (must include a 'comment' column)
65
+ 2. The system will automatically analyze the sentiment of each review
66
+ 3. Generate overall ratings, keyphrase extraction, and summary reports
67
  """)
68
 
69
+ # File upload
70
+ uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
71
+
72
  if uploaded_file is not None:
73
+ # Read data
74
  try:
75
  df = pd.read_csv(uploaded_file)
76
  if 'comment' not in df.columns:
77
+ st.error("The CSV file must contain a 'comment' column")
78
  return
79
+
80
  comments = df['comment'].tolist()
81
  except Exception as e:
82
+ st.error(f"File reading failed: {str(e)}")
83
  return
84
 
85
+ # Show preview
86
+ with st.expander("Preview of Original Data (First 5 Rows)"):
87
  st.dataframe(df.head())
88
 
89
+ if st.button("Start Analysis"):
90
+ # Progress bar settings
91
  progress_bar = st.progress(0)
92
  status_text = st.empty()
93
+
94
  results = []
95
  total = len(comments)
96
+
97
+ # Batch prediction
98
  try:
99
+ # Sentiment analysis
100
  for i, comment in enumerate(comments):
101
  progress = (i+1)/total
102
  progress_bar.progress(progress)
103
+ status_text.text(f"Analyzing sentiment for {i+1}/{total} reviews...")
104
+
105
  prediction = classifier(comment)[0]
106
  results.append({
107
  'comment': comment,
108
  'sentiment': 1 if prediction['label'] == 'LABEL_1' else 0,
109
  'confidence': prediction['score']
110
  })
111
+
112
+ # Convert to DataFrame
113
  result_df = pd.DataFrame(results)
114
+
115
+ # Calculate statistics
116
  positive_count = result_df['sentiment'].sum()
117
  total_reviews = len(result_df)
118
  positive_percent = (positive_count / total_reviews) * 100
119
  star_rating = calculate_star_rating(positive_percent)
120
+
121
+ # Show results
122
+ st.success("Sentiment analysis completed!")
123
+
124
+ # Rating display
125
  col1, col2, col3 = st.columns(3)
126
  with col1:
127
+ st.metric("⭐ Overall Rating", f"{star_rating} Stars")
128
  with col2:
129
+ st.metric("👍 Positive Reviews", f"{positive_count}/{total_reviews}")
130
  with col3:
131
+ st.metric("📈 Positive Ratio", f"{positive_percent:.1f}%")
132
+
133
+ # Progress bar visualization
134
  st.progress(positive_percent/100)
135
+
136
+ # Show example results
137
+ with st.expander("View Detailed Analysis Results (First 10 Rows)"):
138
  st.dataframe(result_df.head(10))
139
+
140
+ # Keyphrase extraction and summary
141
+ st.subheader("📌 Keyphrase Extraction and Summary of Reviews")
142
+
143
+ # Combine all comments into a single text
144
  combined_text = " ".join(comments)
145
+
146
+ # Keyphrase extraction
147
+ with st.spinner("Extracting keyphrases..."):
148
  keyphrases = keyphrase_extractor(combined_text)
149
+ # Sort by confidence and take the top 5
150
  top_keyphrases = sorted(keyphrases, key=lambda x: x['score'], reverse=True)[:5]
151
+
152
+ # Show keyphrases
153
+ st.markdown("**🔍 Extracted Keyphrases:**")
154
  cols = st.columns(5)
155
  for i, phrase in enumerate(top_keyphrases):
156
  cols[i].markdown(f"""
 
163
  background-color: #f9f9f9;
164
  ">
165
  <b>{phrase['word']}</b><br>
166
+ <small>Confidence: {phrase['score']:.2f}</small>
167
  </div>
168
  """, unsafe_allow_html=True)
169
+
170
+ # Generate summary
171
+ with st.spinner("Generating review summary..."):
172
+ # Limit text length to avoid model limitations
173
+ max_length = 1024 # Maximum input length for the model
174
  if len(combined_text) > max_length:
175
  combined_text = combined_text[:max_length]
176
+
177
  summary = summarizer(combined_text,
178
  max_length=130,
179
  min_length=30,
180
  do_sample=False)
181
+
182
+ # Show summary
183
+ st.markdown("**📝 Review Summary:**")
184
  st.info(summary[0]['summary_text'])
185
+
186
+ # Generate downloadable file
187
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
188
  result_df.to_csv(tmp.name, index=False)
189
  with open(tmp.name, "rb") as f:
190
  st.download_button(
191
+ label="Download Full Results",
192
  data=f,
193
  file_name="analysis_results.csv",
194
  mime="text/csv"
195
  )
196
  os.unlink(tmp.name)
197
+
198
  except Exception as e:
199
+ st.error(f"An error occurred during analysis: {str(e)}")
200
  finally:
201
  progress_bar.empty()
202
  status_text.empty()