from collections import Counter
from concurrent.futures import ThreadPoolExecutor  # parallel processing

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
import torch
from wordcloud import WordCloud
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    TokenClassificationPipeline,
    AutoModelForSeq2SeqLM,
)
from transformers.pipelines import AggregationStrategy

from functions import (
    scrape_reddit_data,
    safe_sentiment,
    analyze_detail,
    preprocess_text,
    generate_variants,
    contains_excluded_keywords,
    extract_terms,
    # remove_excluded_from_list,
    process_extracted_result,
)


# ---------- Cached functions for loading the model pipelines ----------
@st.cache_resource(show_spinner=False)
def load_summarizer():
    # Load the BART summarization model and tokenizer once; reuse across reruns
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    return tokenizer, model


def summarizer(text, prompt, max_length=600, min_length=10):
    tokenizer, model = load_summarizer()

    # Tokenize the prompt and article separately without adding special tokens
    prompt_tokens = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
    article_tokens = tokenizer(text, add_special_tokens=False, return_tensors="pt")["input_ids"]

    # Concatenate prompt and article tokens
    combined_input_ids = torch.cat([prompt_tokens, article_tokens], dim=-1)

    # Skip inputs that exceed BART's 1024-token limit
    if len(combined_input_ids[0]) > 1024:
        return None

    # Convert the tensor to a list and add special tokens as required by the model
    input_ids_list = tokenizer.build_inputs_with_special_tokens(combined_input_ids[0].tolist())
    input_ids = torch.tensor([input_ids_list])

    # Generate the summary
    summary_ids = model.generate(input_ids, max_length=max_length, min_length=min_length, do_sample=False)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


@st.cache_resource(show_spinner=False)
def load_sentiment_pipeline():
    # Fine-tuned sentiment pipeline
    tokenizer = AutoTokenizer.from_pretrained("kusa04/CustomModel_reddit")
    model = AutoModelForSequenceClassification.from_pretrained(
        "kusa04/CustomModel_reddit",
        use_auth_token=st.secrets["hugging_face_with_my_fine_turning_model"],
        # use_auth_token=st.secrets["hugging_face_token"]
    )
    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)  # device=0: GPU, -1: CPU
    max_tokens = tokenizer.model_max_length
    if max_tokens > 10000:  # some tokenizers report a huge sentinel here; fall back to a safe length
        max_tokens = 200
    return sentiment_pipeline, tokenizer, max_tokens


class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs,
        )

    def postprocess(self, all_outputs):
        results = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([result.get("word").strip() for result in results])


@st.cache_resource(show_spinner=False)
def get_keyword_pipeline():
    model_name = "ml6team/keyphrase-extraction-kbir-inspec"
    return KeyphraseExtractionPipeline(model=model_name)


keyword_pipeline = get_keyword_pipeline()


def keyword_extractor(text):
    try:
        return keyword_pipeline(text)
    except Exception:
        return None


st.title("Scraping & Analysis of Reddit")
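# Illustrative only (not executed): expected return shapes of the pipelines above.
#   keyword_extractor("Monster Hunter Wilds performance on PC")
#       -> a np.ndarray of unique keyphrase strings, or None if the pipeline fails
#   summarizer(post_text, "Summarize the following Reddit post:", max_length=60)
#       -> a decoded summary string, or None when prompt + text exceed the
#          1024-token BART window (callers below rely on that None to skip chunks)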
# --- Sidebar ---
with st.sidebar:
    st.header("Controls")
    user_query = st.text_input("Enter search keyword:", value="Monster Hunter Wilds")
    scrape_btn = st.button("Scrape")
    summarize_btn = st.button("Summarize")
    sentiment_btn = st.button("Sentiment Analysis")
    keyword_extraction_btn = st.button("Keyword Extraction")

# --- User Input ---
# user_query = st.text_input("Enter search keyword:", value="Monster Hunter Wilds")
if user_query:
    # Match both the spaced and unspaced spellings of the query
    search_query = f'"{user_query}" OR "{user_query.replace(" ", "")}"'
    st.session_state["user_query"] = user_query
else:
    search_query = ""
st.write("Search Query:", search_query)

# Button to trigger scraping
if scrape_btn:
    with st.spinner("Scraping..."):
        # progress_bar = st.progress(0)
        progress_text = st.empty()
        total_limit = 5000  # Maximum number of submissions to check
        df = scrape_reddit_data(search_query, total_limit)
        length = len(df)
        progress_text.text(f"Collected {length} valid posts.")
        st.session_state["df"] = df

# Button to trigger summarizing the overall text using chunking
if summarize_btn:
    df = st.session_state.get("df")
    if df is None or df.empty:
        st.write("Please run 'Scrape' with an accurate keyword first.")
        st.stop()

    # Split the "Detail" texts into a list
    all_details = df["Detail"].tolist()

    # Divide the list into chunks of 4 posts each
    chunk_size = 4
    chunks = [all_details[i:i + chunk_size] for i in range(0, len(all_details), chunk_size)]
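    # Worked example of the chunking above (comment only): with 10 posts and
    # chunk_size = 4, `chunks` holds groups of 4, 4, and 2 posts. Small chunks
    # keep prompt + posts under summarizer()'s 1024-token cap; an oversized
    # chunk makes summarizer() return None, and those Nones are dropped by the
    # `if chunk` filter when the summaries are combined below.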
    chunk_summaries = []
    # Summarize each chunk individually
    for idx, chunk in enumerate(chunks):
        # Join the posts in the chunk with newlines
        chunk_text = " \n ".join(chunk)
        with st.spinner(f"Summarizing chunk {idx + 1} of {len(chunks)}..."):
            prompt = """
            Please summarize the following Reddit posts formally, highlighting the authors' key
            experiences and opinions, especially HOW THEY THINK ABOUT IT.
            NOTE THAT every time you see an indent, it marks a different user, posted by different people.
            So, TELL ME THE WHOLE TENDENCY ACROSS MULTIPLE REDDIT USERS:
            """
            summary_output = summarizer(chunk_text, prompt, max_length=50, min_length=2)
            chunk_summaries.append(summary_output)

    # Combine all chunk summaries using the same delimiter (None summaries are skipped)
    combined_summary_text = " \n ".join(str(chunk) + "\n\n\n\n" for chunk in chunk_summaries if chunk)

    # Generate an overall summary from the combined chunk summaries
    # with st.spinner("Generating overall summary from chunk summaries..."):
    #     prompt = """
    #     Based on the above text, what kind of tendencies do you think can be perceived in the users?
    #     I believe that there is a space between each sentence, which indicates that each belongs to a different user.
    #     With that in mind, PLEASE EXPLAIN to me in an easy-to-understand manner what each user tends to be seeking.
    #     """
    #     overall_summary_output = summarizer(combined_summary_text, prompt, max_length=600)

    # Display the overall summary
    st.subheader("Overall Summary of All Posts")
    st.write(combined_summary_text)

    # Save the DataFrame and overall summary in session state for later use
    st.session_state["df"] = df
    st.session_state["overall_summary"] = combined_summary_text

# Button to trigger sentiment analysis
if sentiment_btn:
    df = st.session_state.get("df")
    if df is None or df.empty:
        st.write("Please run 'Scrape' with an accurate keyword first.")
        st.stop()

    length = len(df)
    with st.spinner("Loading..."):
        sentiment_pipeline, tokenizer, max_tokens = load_sentiment_pipeline()
    st.write("Loaded...")

    with st.spinner("Doing Sentiment Analysis..."):
        progress_bar = st.progress(0)

        # Titles are short, so batch processing is unnecessary
        df['Title_Sentiment'] = df['Title'].apply(
            lambda x: safe_sentiment(sentiment_pipeline, preprocess_text(x), length, progress_bar) if x else None
        )

        # Parallel processing for each row of Detail
        with ThreadPoolExecutor() as executor:
            detail_sentiments = list(executor.map(
                lambda x: analyze_detail(x, tokenizer, sentiment_pipeline, max_tokens) if x else None,
                df['Detail']
            ))
        df['Detail_Sentiment'] = detail_sentiments

        # Unpack the {label, score} dicts into separate columns
        df["Title_Sentiment_Label"] = df["Title_Sentiment"].apply(lambda x: x["label"] if x else None)
        df["Title_Sentiment_Score"] = df["Title_Sentiment"].apply(lambda x: x["score"] if x else None)
        df["Detail_Sentiment_Label"] = df["Detail_Sentiment"].apply(lambda x: x["label"] if x else None)
        df["Detail_Sentiment_Score"] = df["Detail_Sentiment"].apply(lambda x: x["score"] if x else None)
        df = df.drop(["Title_Sentiment", "Detail_Sentiment"], axis=1)

        cols = ["Title", "Title_Sentiment_Label", "Title_Sentiment_Score",
                "Detail", "Detail_Sentiment_Label", "Detail_Sentiment_Score", "Date"]
        df = df[cols]
        st.session_state["df"] = df
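    # Note on the plots below: counts are pivoted to one column per sentiment,
    # and the dashed trend lines are drawn over cumsum(axis=1) of each pivot so
    # every line tracks the running top of its stacked-bar segment rather than
    # the raw per-sentiment count.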
    with st.spinner("Drawing Sentiment Graphs..."):
        # ① Create yyyy-mm column
        df["YearMonth"] = pd.to_datetime(df["Date"]).dt.to_period("M").astype(str)
        df["Title_Sentiment_Label"] = df["Title_Sentiment_Label"].str.lower()
        df["Detail_Sentiment_Label"] = df["Detail_Sentiment_Label"].str.lower()

        # ② Group by month and sentiment for title & detail
        title_counts = df.groupby(["YearMonth", "Title_Sentiment_Label"]).size().reset_index(name="count")
        detail_counts = df.groupby(["YearMonth", "Detail_Sentiment_Label"]).size().reset_index(name="count")

        # ③ Pivot → index=YearMonth, columns=sentiment, values=count
        title_pivot = title_counts.pivot(index="YearMonth", columns="Title_Sentiment_Label", values="count").fillna(0)
        detail_pivot = detail_counts.pivot(index="YearMonth", columns="Detail_Sentiment_Label", values="count").fillna(0)

        # Sort chronologically
        title_pivot = title_pivot.sort_index()
        detail_pivot = detail_pivot.sort_index()

        # --- ④ Visualize title graph ---
        fig1, ax1 = plt.subplots(figsize=(15, 6))

        # Stacked bar plot
        title_pivot.plot(kind="bar", stacked=True, ax=ax1, color={
            "positive": "orange",
            "neutral": "yellowgreen",
            "negative": "blue"
        })

        # Trend lines over the cumulative counts
        title_cum = title_pivot.cumsum(axis=1)
        for sentiment, color in zip(["positive", "neutral", "negative"], ["darkorange", "green", "navy"]):
            if sentiment in title_cum.columns:
                ax1.plot(title_cum.index, title_cum[sentiment], label=f"{sentiment} trend",
                         marker="o", color=color, linestyle="--")

        ax1.set_title("Monthly Title Sentiment Counts")
        ax1.set_xlabel("Time (YYYY-MM)")
        ax1.set_ylabel("Count")
        ax1.legend()
        plt.xticks(rotation=45)
        st.pyplot(fig1)

        # --- ⑤ Visualize detail graph ---
        fig2, ax2 = plt.subplots(figsize=(15, 6))

        # Stacked bar plot
        detail_pivot.plot(kind="bar", stacked=True, ax=ax2, color={
            "positive": "darkorange",
            "neutral": "forestgreen",
            "negative": "navy"
        })

        # Trend lines over the cumulative counts
        detail_cum = detail_pivot.cumsum(axis=1)
        for sentiment, color in zip(["positive", "neutral", "negative"], ["orangered", "limegreen", "darkblue"]):
            if sentiment in detail_cum.columns:
                ax2.plot(detail_cum.index, detail_cum[sentiment], label=f"{sentiment} trend",
                         marker="o", color=color, linestyle="--")

        ax2.set_title("Monthly Detail Sentiment Counts")
        ax2.set_xlabel("Time (YYYY-MM)")
        ax2.set_ylabel("Count")
        ax2.legend()
        plt.xticks(rotation=45)
        st.pyplot(fig2)

if keyword_extraction_btn:
    df = st.session_state.get("df")
    user_query = st.session_state.get("user_query")
    if (df is None or df.empty) or (user_query is None):
        st.write("Please run 'Scrape' with an accurate keyword first.")
        st.stop()
    else:
        with st.spinner("Extracting Keyword..."):
            target_col = "Detail_Keyword"

            # Run keyword extraction on each 'Detail' in parallel
            with ThreadPoolExecutor() as executor:
                results = list(
                    executor.map(lambda x: keyword_extractor(preprocess_text(x)) if x else None, df['Detail'])
                )
            df[target_col] = results
            # st.write("df: ", df[target_col].head().to_list())

            # Drop rows whose keywords are just variants of the search query itself
            excluded_keywords = generate_variants(user_query)
            df_filtered = df[~df[target_col].apply(
                lambda cell: contains_excluded_keywords(cell, excluded_keywords=excluded_keywords)
            )].copy()
            # st.write("filtered: ", df_filtered[target_col].head().to_list())

            # Flatten the per-post keyword arrays into a single list of terms
            terms_list = df_filtered[target_col].dropna().apply(lambda x: extract_terms(x))
            terms = [term for sublist in terms_list for term in sublist]

            # Count term frequency
            freq = Counter(terms)

        with st.spinner("Drawing Keywords Diagram..."):
            if freq:
                # st.write("freq preview:", list(freq.items())[:10])  # optional debug
                wc = WordCloud(width=800, height=400, background_color="white")
                wc.generate_from_frequencies(freq)
                fig, ax = plt.subplots(figsize=(10, 5))
                ax.imshow(wc, interpolation="bilinear")
                ax.axis("off")
                st.pyplot(fig)
            else:
                st.warning("No keywords to display.")
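# ---------------------------------------------------------------------------
# Usage sketch (assumed; the file name `app.py` is illustrative):
#   pip install streamlit torch transformers pandas matplotlib wordcloud
#   streamlit run app.py
# Workflow: click "Scrape" first so st.session_state["df"] is populated; the
# "Summarize", "Sentiment Analysis", and "Keyword Extraction" buttons then all
# operate on that cached DataFrame.
# ---------------------------------------------------------------------------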