test2025SpL2

Sleeping

App Files Community

ysuneu committed on May 13

Commit

fda07e6

verified ·

1 Parent(s): 9a29b32

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -271

app.py CHANGED Viewed

@@ -3,200 +3,9 @@ import pandas as pd
 from transformers import pipeline
 import tempfile
 import os
-from typing import List, Dict
-import matplotlib.pyplot as plt
-@st.cache_resource
-def load_model():
-    """Load and cache the sentiment analysis model"""
-    try:
-        return pipeline(
-            "text-classification",
-            model="KeonBlackwell/movie_sentiment_model",
-            tokenizer="distilbert-base-uncased"
-        )
-    except Exception as e:
-        st.error(f"模型加载失败: {str(e)}")
-        return None
-def analyze_comments(comments: List[str], classifier) -> List[Dict]:
-    """Analyze a list of comments and return sentiment results"""
-    results = []
-    for comment in comments:
-        prediction = classifier(comment)[0]
-        results.append({
-            'comment': comment,
-            'sentiment': 1 if prediction['label'] == 'LABEL_1' else 0,
-            'confidence': prediction['score']
-        })
-    return results
-def calculate_star_rating(positive_percent: float) -> int:
-    """Convert positive percentage to star rating (1-5)"""
-    if positive_percent >= 80:
-        return 5
-    elif positive_percent >= 60:
-        return 4
-    elif positive_percent >= 40:
-        return 3
-    elif positive_percent >= 20:
-        return 2
-    return 1
-def show_sentiment_distribution(positive_percent: float):
-    """Display a pie chart of sentiment distribution"""
-    fig, ax = plt.subplots()
-    ax.pie([positive_percent, 100-positive_percent],
-           labels=['Positive', 'Negative'],
-           autopct='%1.1f%%',
-           colors=['#4CAF50', '#F44336'])
-    ax.axis('equal')  # Equal aspect ratio ensures pie is drawn as a circle
-    st.pyplot(fig)
-def main():
-    st.set_page_config(page_title="电影评论分析系统", page_icon="🎬")
-    # Custom CSS
-    st.markdown("""
-    <style>
-    .reportview-container {
-        background: #f0f2f6;
-    }
-    .stProgress > div > div > div > div {
-        background-color: #4CAF50;
-    }
-    </style>
-    """, unsafe_allow_html=True)
-    # Load model
-    classifier = load_model()
-    if classifier is None:
-        return
-    # Page layout
-    st.title("🎬 电影评论批量分析系统")
-    st.markdown("""
-    ### 使用说明：
-    1. 上传包含电影评论的CSV文件（需包含'comment'列）
-    2. 系统自动分析每条评论的情感倾向
-    3. 生成整体评分和分析报告
-    """)
-    # Sample file download
-    with st.expander("下载示例文件"):
-        sample_data = pd.DataFrame({'comment': [
-            "This movie was fantastic! The acting was superb.",
-            "I didn't like the plot. It was too predictable.",
-            "The cinematography was beautiful but the story was weak."
-        ]})
-        st.download_button(
-            label="下载示例CSV",
-            data=sample_data.to_csv(index=False).encode('utf-8'),
-            file_name="sample_reviews.csv",
-            mime="text/csv"
-        )
-    # File upload
-    uploaded_file = st.file_uploader("上传CSV文件", type=["csv"])
-    if uploaded_file is not None:
-        try:
-            df = pd.read_csv(uploaded_file)
-            if 'comment' not in df.columns:
-                st.error("CSV文件必须包含'comment'列")
-                return
-            comments = df['comment'].dropna().tolist()
-            with st.expander("原始数据预览（前5行）"):
-                st.dataframe(df.head())
-            if st.button("开始分析", type="primary"):
-                if len(comments) > 1000:
-                    st.warning(f"检测到大量评论 ({len(comments)} 条)，分析可能需要较长时间...")
-                with st.spinner("分析中，请稍候..."):
-                    results = analyze_comments(comments, classifier)
-                    result_df = pd.DataFrame(results)
-                    # Calculate statistics
-                    positive_count = result_df['sentiment'].sum()
-                    total_reviews = len(result_df)
-                    positive_percent = (positive_count / total_reviews) * 100
-                    star_rating = calculate_star_rating(positive_percent)
-                    # Display results
-                    st.success("分析完成！")
-                    # Metrics
-                    col1, col2, col3 = st.columns(3)
-                    with col1:
-                        st.metric("⭐ 综合评分", f"{star_rating} 星")
-                    with col2:
-                        st.metric("👍 正面评价", f"{positive_count}/{total_reviews}")
-                    with col3:
-                        st.metric("📈 正面比例", f"{positive_percent:.1f}%")
-                    # Visualizations
-                    show_sentiment_distribution(positive_percent)
-                    # Sample results
-                    with st.expander("查看详细分析结果（前10条）"):
-                        st.dataframe(result_df.head(10))
-                    # Download
-                    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
-                        result_df.to_csv(tmp.name, index=False)
-                        with open(tmp.name, "rb") as f:
-                            st.download_button(
-                                label="下载完整结果",
-                                data=f,
-                                file_name="analysis_results.csv",
-                                mime="text/csv"
-                            )
-                    os.unlink(tmp.name)
-        except Exception as e:
-            st.error(f"处理文件时出错: {str(e)}")
-if __name__ == "__main__":
-    main()
-Key improvements made:
-Added model caching
-Included a sample file download
-Added a pie chart visualization
-Better organized the code into functions
-Added type hints
-Improved error handling and user feedback
-Added warning for large files
-Would you like me to explain any specific part of these improvements in more detail?
-我不需要优化，在源代码上添加以下几步，合并CSV里的每行评论，合并成一个文本，利用huggingface上的模型keyphrase_extractor = pipeline(
-    "token-classification",
-    model="ml6team/keyphrase-extraction-distilbert-inspec",
-    aggregation_strategy="simple"  # 用于合并子词/标记
-和summarizer = pipeline("summarization",
-            model="facebook/bart-large-cnn")
-进行提取5个关键词，和生成总结，我是在huggingface上设计app
-我将在你的原始代码基础上添加关键词提取和总结生成功能。以下是修改后的代码，保留了原有功能，同时增加了你要求的新特性：
-python
-import streamlit as st
-import pandas as pd
-from transformers import pipeline
-import tempfile
-import os
 def calculate_star_rating(positive_percent):
-    """将正面评价百分比转换为星级评分"""
     if positive_percent >= 80:
         return 5
     elif positive_percent >= 60:
@@ -209,9 +18,9 @@ def calculate_star_rating(positive_percent):
         return 1
 def main():
-    st.set_page_config(page_title="电影评论分析系统", page_icon="🎬")
-    # 自定义样式
     st.markdown("""
     <style>
     .reportview-container {
@@ -223,125 +32,125 @@ def main():
     </style>
     """, unsafe_allow_html=True)
-    # 模型加载
-    with st.spinner("正在加载所有模型，这可能需要几分钟..."):
         try:
-            # 情感分析模型
             classifier = pipeline(
                 "text-classification",
                 model="KeonBlackwell/movie_sentiment_model",
                 tokenizer="distilbert-base-uncased"
             )
-            # 关键词提取模型
             keyphrase_extractor = pipeline(
                 "token-classification",
                 model="ml6team/keyphrase-extraction-distilbert-inspec",
                 aggregation_strategy="simple"
             )
-            # 摘要生成模型
             summarizer = pipeline("summarization",
                                 model="facebook/bart-large-cnn")
         except Exception as e:
-            st.error(f"模型加载失败: {str(e)}")
             return
-    # 页面布局
-    st.title("🎬 电影评论批量分析系统")
     st.markdown("""
-    ### 使用说明：
-    1. 上传包含电影评论的CSV文件（需包含'comment'列）
-    2. 系统自动分析每条评论的情感倾向
-    3. 生成整体评分、关键词提取和总结报告
     """)
-    # 文件上传
-    uploaded_file = st.file_uploader("上传CSV文件", type=["csv"])
     if uploaded_file is not None:
-        # 读取数据
         try:
             df = pd.read_csv(uploaded_file)
             if 'comment' not in df.columns:
-                st.error("CSV文件必须包含'comment'列")
                 return
             comments = df['comment'].tolist()
         except Exception as e:
-            st.error(f"文件读取失败: {str(e)}")
             return
-        # 显示预览
-        with st.expander("原始数据预览（前5行）"):
             st.dataframe(df.head())
-        if st.button("开始分析"):
-            # 进度条设置
             progress_bar = st.progress(0)
             status_text = st.empty()
             results = []
             total = len(comments)
-            # 批量预测
             try:
-                # 情感分析
                 for i, comment in enumerate(comments):
                     progress = (i+1)/total
                     progress_bar.progress(progress)
-                    status_text.text(f"正在分析情感 {i+1}/{total} 条评论...")
                     prediction = classifier(comment)[0]
                     results.append({
                         'comment': comment,
                         'sentiment': 1 if prediction['label'] == 'LABEL_1' else 0,
                         'confidence': prediction['score']
                     })
-                # 转换为DataFrame
                 result_df = pd.DataFrame(results)
-                # 计算统计指标
                 positive_count = result_df['sentiment'].sum()
                 total_reviews = len(result_df)
                 positive_percent = (positive_count / total_reviews) * 100
                 star_rating = calculate_star_rating(positive_percent)
-                # 显示结果
-                st.success("情感分析完成！")
-                # 评分展示
                 col1, col2, col3 = st.columns(3)
                 with col1:
-                    st.metric("⭐ 综合评分", f"{star_rating} 星")
                 with col2:
-                    st.metric("👍 正面评价", f"{positive_count}/{total_reviews}")
                 with col3:
-                    st.metric("📈 正面比例", f"{positive_percent:.1f}%")
-                # 进度条可视化
                 st.progress(positive_percent/100)
-                # 显示示例结果
-                with st.expander("查看详细分析结果（前10条）"):
                     st.dataframe(result_df.head(10))
-                # 关键词提取和总结
-                st.subheader("📌 评论关键词提取与总结")
-                # 合并所有评论为一个文本
                 combined_text = " ".join(comments)
-                # 关键词提取
-                with st.spinner("正在提取关键词..."):
                     keyphrases = keyphrase_extractor(combined_text)
-                    # 按置信度排序并取前5个
                     top_keyphrases = sorted(keyphrases, key=lambda x: x['score'], reverse=True)[:5]
-                # 显示关键词
-                st.markdown("**🔍 提取的关键词:**")
                 cols = st.columns(5)
                 for i, phrase in enumerate(top_keyphrases):
                     cols[i].markdown(f"""
@@ -354,40 +163,40 @@ def main():
                         background-color: #f9f9f9;
                     ">
                         <b>{phrase['word']}</b><br>
-                        <small>置信度: {phrase['score']:.2f}</small>
                     </div>
                     """, unsafe_allow_html=True)
-                # 生成总结
-                with st.spinner("正在生成评论总结..."):
-                    # 限制文本长度以避免模型限制
-                    max_length = 1024  # 模型的最大输入长度
                     if len(combined_text) > max_length:
                         combined_text = combined_text[:max_length]
                     summary = summarizer(combined_text,
                                         max_length=130,
                                         min_length=30,
                                         do_sample=False)
-                # 显示总结
-                st.markdown("**📝 评论总结:**")
                 st.info(summary[0]['summary_text'])
-                # 生成可下载文件
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
                     result_df.to_csv(tmp.name, index=False)
                     with open(tmp.name, "rb") as f:
                         st.download_button(
-                            label="下载完整结果",
                             data=f,
                             file_name="analysis_results.csv",
                             mime="text/csv"
                         )
                 os.unlink(tmp.name)
             except Exception as e:
-                st.error(f"分析过程中出现错误: {str(e)}")
             finally:
                 progress_bar.empty()
                 status_text.empty()

 from transformers import pipeline
 import tempfile
 import os
 def calculate_star_rating(positive_percent):
+    """Convert positive percentage to star rating"""
     if positive_percent >= 80:
         return 5
     elif positive_percent >= 60:
         return 1
 def main():
+    st.set_page_config(page_title="Movie Review Analysis System", page_icon="🎬")
+    # Custom styles
     st.markdown("""
     <style>
     .reportview-container {
     </style>
     """, unsafe_allow_html=True)
+    # Model loading
+    with st.spinner("Loading all models, this may take a few minutes..."):
         try:
+            # Sentiment analysis model
             classifier = pipeline(
                 "text-classification",
                 model="KeonBlackwell/movie_sentiment_model",
                 tokenizer="distilbert-base-uncased"
             )
+            # Keyphrase extraction model
             keyphrase_extractor = pipeline(
                 "token-classification",
                 model="ml6team/keyphrase-extraction-distilbert-inspec",
                 aggregation_strategy="simple"
             )
+            # Summarization model
             summarizer = pipeline("summarization",
                                 model="facebook/bart-large-cnn")
         except Exception as e:
+            st.error(f"Model loading failed: {str(e)}")
             return
+    # Page layout
+    st.title("🎬 Movie Review Batch Analysis System")
     st.markdown("""
+    ### Instructions:
+    1. Upload a CSV file containing movie reviews (must include a 'comment' column)
+    2. The system will automatically analyze the sentiment of each review
+    3. Generate overall ratings, keyphrase extraction, and summary reports
     """)
+    # File upload
+    uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
     if uploaded_file is not None:
+        # Read data
         try:
             df = pd.read_csv(uploaded_file)
             if 'comment' not in df.columns:
+                st.error("The CSV file must contain a 'comment' column")
                 return
             comments = df['comment'].tolist()
         except Exception as e:
+            st.error(f"File reading failed: {str(e)}")
             return
+        # Show preview
+        with st.expander("Preview of Original Data (First 5 Rows)"):
             st.dataframe(df.head())
+        if st.button("Start Analysis"):
+            # Progress bar settings
             progress_bar = st.progress(0)
             status_text = st.empty()
             results = []
             total = len(comments)
+            # Batch prediction
             try:
+                # Sentiment analysis
                 for i, comment in enumerate(comments):
                     progress = (i+1)/total
                     progress_bar.progress(progress)
+                    status_text.text(f"Analyzing sentiment for {i+1}/{total} reviews...")
                     prediction = classifier(comment)[0]
                     results.append({
                         'comment': comment,
                         'sentiment': 1 if prediction['label'] == 'LABEL_1' else 0,
                         'confidence': prediction['score']
                     })
+                # Convert to DataFrame
                 result_df = pd.DataFrame(results)
+                # Calculate statistics
                 positive_count = result_df['sentiment'].sum()
                 total_reviews = len(result_df)
                 positive_percent = (positive_count / total_reviews) * 100
                 star_rating = calculate_star_rating(positive_percent)
+                # Show results
+                st.success("Sentiment analysis completed!")
+                # Rating display
                 col1, col2, col3 = st.columns(3)
                 with col1:
+                    st.metric("⭐ Overall Rating", f"{star_rating} Stars")
                 with col2:
+                    st.metric("👍 Positive Reviews", f"{positive_count}/{total_reviews}")
                 with col3:
+                    st.metric("📈 Positive Ratio", f"{positive_percent:.1f}%")
+                # Progress bar visualization
                 st.progress(positive_percent/100)
+                # Show example results
+                with st.expander("View Detailed Analysis Results (First 10 Rows)"):
                     st.dataframe(result_df.head(10))
+                # Keyphrase extraction and summary
+                st.subheader("📌 Keyphrase Extraction and Summary of Reviews")
+                # Combine all comments into a single text
                 combined_text = " ".join(comments)
+                # Keyphrase extraction
+                with st.spinner("Extracting keyphrases..."):
                     keyphrases = keyphrase_extractor(combined_text)
+                    # Sort by confidence and take the top 5
                     top_keyphrases = sorted(keyphrases, key=lambda x: x['score'], reverse=True)[:5]
+                # Show keyphrases
+                st.markdown("**🔍 Extracted Keyphrases:**")
                 cols = st.columns(5)
                 for i, phrase in enumerate(top_keyphrases):
                     cols[i].markdown(f"""
                         background-color: #f9f9f9;
                     ">
                         <b>{phrase['word']}</b><br>
+                        <small>Confidence: {phrase['score']:.2f}</small>
                     </div>
                     """, unsafe_allow_html=True)
+                # Generate summary
+                with st.spinner("Generating review summary..."):
+                    # Limit text length to avoid model limitations
+                    max_length = 1024  # Maximum input length for the model
                     if len(combined_text) > max_length:
                         combined_text = combined_text[:max_length]
                     summary = summarizer(combined_text,
                                         max_length=130,
                                         min_length=30,
                                         do_sample=False)
+                # Show summary
+                st.markdown("**📝 Review Summary:**")
                 st.info(summary[0]['summary_text'])
+                # Generate downloadable file
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
                     result_df.to_csv(tmp.name, index=False)
                     with open(tmp.name, "rb") as f:
                         st.download_button(
+                            label="Download Full Results",
                             data=f,
                             file_name="analysis_results.csv",
                             mime="text/csv"
                         )
                 os.unlink(tmp.name)
             except Exception as e:
+                st.error(f"An error occurred during analysis: {str(e)}")
             finally:
                 progress_bar.empty()
                 status_text.empty()