diff --git a/rstat_tool/main.py b/rstat_tool/main.py
index 44e839a..5437030 100644
--- a/rstat_tool/main.py
+++ b/rstat_tool/main.py
@@ -14,7 +14,7 @@ import yfinance as yf
 import pandas as pd
 
 from . import database
-from .ticker_extractor import extract_tickers
+from .ticker_extractor import extract_golden_tickers, extract_potential_tickers
 from .sentiment_analyzer import get_sentiment_score
 from .logger_setup import setup_logging, logger as log
 
@@ -65,91 +65,75 @@ def fetch_financial_data(ticker_symbol):
 
 def _process_submission(submission, subreddit_id, conn, comment_limit):
     """
-    Processes a single Reddit submission to find and save mentions.
-    Crucially, it returns a set of all unique ticker symbols found.
-    FINANCIAL DATA IS NOT FETCHED HERE.
+    Processes a single Reddit submission using the "Golden Ticker" logic.
+    - Prioritizes tickers with a '$' prefix.
+    - Falls back to potential tickers only if no '$' tickers are found.
     """
-    tickers_in_title = set(extract_tickers(submission.title))
-    all_tickers_found_in_post = set(tickers_in_title)
-    ticker_id_cache = {}
+    # 1. --- Golden Ticker Discovery ---
+    # First, search the entire post (title and body) for high-confidence '$' tickers.
+    post_text_for_discovery = submission.title + " " + submission.selftext
+    golden_tickers = extract_golden_tickers(post_text_for_discovery)
+
+    tickers_in_title = set()
+    comment_only_tickers = set()
+    all_tickers_found_in_post = set()
 
+    # 2. --- Apply Contextual Logic ---
+    if golden_tickers:
+        # --- CASE A: Golden Tickers were found ---
+        log.info(f" -> Golden Ticker(s) Found: {', '.join(golden_tickers)}. Prioritizing these.")
+        all_tickers_found_in_post.update(golden_tickers)
+        # We only care about which of the golden tickers appeared in the title for the hybrid logic.
+        tickers_in_title = {ticker for ticker in golden_tickers if ticker in extract_golden_tickers(submission.title)}
+    else:
+        # --- CASE B: No Golden Tickers, fall back to best-guess ---
+        log.info(" -> No Golden Tickers. Falling back to potential ticker search.")
+        # Now we search for potential tickers (e.g., 'GME' without a '$')
+        tickers_in_title = extract_potential_tickers(submission.title)
+        all_tickers_found_in_post.update(tickers_in_title)
+
+    # 3. --- Mention Processing (This logic remains the same, but uses our cleanly identified tickers) ---
+    ticker_id_cache = {}
     submission.comments.replace_more(limit=0)
     all_comments = submission.comments.list()[:comment_limit]
 
-    # Process title mentions first
+    # Process title mentions
    if tickers_in_title:
-        log.info(
-            f" -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments."
-        )
+        log.info(f" -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments.")
         post_sentiment = get_sentiment_score(submission.title)
         for ticker_symbol in tickers_in_title:
-            ticker_id = database.get_or_create_entity(
-                conn, "tickers", "symbol", ticker_symbol
-            )
+            ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
             ticker_id_cache[ticker_symbol] = ticker_id
-            database.add_mention(
-                conn,
-                ticker_id,
-                subreddit_id,
-                submission.id,
-                "post",
-                int(submission.created_utc),
-                post_sentiment,
-            )
+            database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'post', int(submission.created_utc), post_sentiment)
 
     # Process comments
     for comment in all_comments:
         comment_sentiment = get_sentiment_score(comment.body)
         if tickers_in_title:
-            # If title has tickers, every comment is a mention for them
             for ticker_symbol in tickers_in_title:
-                ticker_id = ticker_id_cache[ticker_symbol]  # Guaranteed to be in cache
-                database.add_mention(
-                    conn,
-                    ticker_id,
-                    subreddit_id,
-                    submission.id,
-                    "comment",
-                    int(comment.created_utc),
-                    comment_sentiment,
-                )
+                ticker_id = ticker_id_cache[ticker_symbol]
+                database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
         else:
-            # Otherwise, only direct mentions in comments count
-            tickers_in_comment = set(extract_tickers(comment.body))
+            # If no title tickers, we must scan comments for potential tickers
+            tickers_in_comment = extract_potential_tickers(comment.body)
             if tickers_in_comment:
                 all_tickers_found_in_post.update(tickers_in_comment)
                 for ticker_symbol in tickers_in_comment:
-                    ticker_id = database.get_or_create_entity(
-                        conn, "tickers", "symbol", ticker_symbol
-                    )
-                    database.add_mention(
-                        conn,
-                        ticker_id,
-                        subreddit_id,
-                        submission.id,
-                        "comment",
-                        int(comment.created_utc),
-                        comment_sentiment,
-                    )
+                    ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
+                    database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
 
-    # Save deep dive analysis (this is separate from mention counting)
+    # 4. --- Save Deep Dive and Return Tickers for Financial Update ---
+    # (This part is unchanged)
     all_comment_sentiments = [get_sentiment_score(c.body) for c in all_comments]
-    avg_sentiment = (
-        sum(all_comment_sentiments) / len(all_comment_sentiments)
-        if all_comment_sentiments
-        else 0
-    )
+    avg_sentiment = sum(all_comment_sentiments) / len(all_comment_sentiments) if all_comment_sentiments else 0
 
     post_analysis_data = {
-        "post_id": submission.id,
-        "title": submission.title,
-        "post_url": f"https://reddit.com{submission.permalink}",
-        "subreddit_id": subreddit_id,
-        "post_timestamp": int(submission.created_utc),
-        "comment_count": len(all_comments),
-        "avg_comment_sentiment": avg_sentiment,
+        "post_id": submission.id, "title": submission.title,
+        "post_url": f"https://reddit.com{submission.permalink}", "subreddit_id": subreddit_id,
+        "post_timestamp": int(submission.created_utc), "comment_count": len(all_comments),
+        "avg_comment_sentiment": avg_sentiment
     }
     database.add_or_update_post_analysis(conn, post_analysis_data)
-
+
     return all_tickers_found_in_post
diff --git a/rstat_tool/ticker_extractor.py b/rstat_tool/ticker_extractor.py
index d9a6e89..79d8885 100644
--- a/rstat_tool/ticker_extractor.py
+++ b/rstat_tool/ticker_extractor.py
@@ -119,23 +119,26 @@ COMMON_WORDS_BLACKLIST = {
     "ZEN", "ZERO", "ZEV"
 }
 
-def extract_tickers(text):
+def extract_golden_tickers(text):
     """
-    Extracts potential stock tickers from a given piece of text.
-    A ticker is identified as a 1-5 character uppercase word, or a word prefixed with $.
+    Extracts ONLY tickers with a '$' prefix. This is the highest-confidence signal.
+    Returns a set of cleaned ticker symbols (e.g., {'TSLA', 'GME'}).
     """
-    # Regex to find potential tickers:
-    # 1. Words prefixed with $: $AAPL, $TSLA
-    # 2. All-caps words between 1 and 5 characters: GME, AMC
-    ticker_regex = r"\$[A-Z]{1,5}\b|\b[A-Z]{2,5}\b"
+    # Regex to find words prefixed with $: $AAPL, $TSLA
+    ticker_regex = r"\$[A-Z]{1,5}\b"
+    tickers = re.findall(ticker_regex, text)
+    # Clean the tickers by removing the '$' and return as a set
+    return {ticker.replace("$", "").upper() for ticker in tickers}
+def extract_potential_tickers(text):
+    """
+    Extracts potential tickers (all-caps words). This is a lower-confidence signal
+    used as a fallback when no golden tickers are present.
+    Returns a set of cleaned ticker symbols.
+    """
+    # Regex to find all-caps words between 2 and 5 characters: GME, AMC
+    ticker_regex = r"\b[A-Z]{2,5}\b"
     potential_tickers = re.findall(ticker_regex, text)
-
-    # Filter out common words and remove the '$' prefix
-    tickers = []
-    for ticker in potential_tickers:
-        cleaned_ticker = ticker.replace("$", "").upper()
-        if cleaned_ticker not in COMMON_WORDS_BLACKLIST:
-            tickers.append(cleaned_ticker)
-
-    return tickers
+
+    # Filter out common blacklisted words
+    return {ticker for ticker in potential_tickers if ticker not in COMMON_WORDS_BLACKLIST}
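
For reference, a minimal, self-contained sketch of how the two new extractor functions behave. The regexes are copied from the diff above; the tiny blacklist is only a stand-in for the full `COMMON_WORDS_BLACKLIST` defined in `ticker_extractor.py`.

```python
import re

# Stand-in for the full COMMON_WORDS_BLACKLIST (assumption: the real list is much larger).
BLACKLIST_STUB = {"CEO", "YOLO", "USA"}

def extract_golden_tickers(text):
    # High-confidence signal: only words explicitly prefixed with '$'.
    return {t.replace("$", "") for t in re.findall(r"\$[A-Z]{1,5}\b", text)}

def extract_potential_tickers(text):
    # Lower-confidence fallback: bare all-caps words, minus blacklisted ones.
    return {t for t in re.findall(r"\b[A-Z]{2,5}\b", text) if t not in BLACKLIST_STUB}

sample = "YOLO into $GME and maybe AMC, says the CEO"
print(extract_golden_tickers(sample))     # {'GME'}
print(extract_potential_tickers(sample))  # {'AMC', 'GME'} (set order may vary)
```

Note that neither regex matches lowercase cashtags such as `$gme`; both character classes are uppercase-only, so casing in the source text matters.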
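
Distilled from the `_process_submission` changes, the new attribution flow reduces to the sketch below. It imports the two extractors added in this diff but skips the database writes and sentiment scoring; the helper name `choose_tickers` and its return shape are illustrative only, not part of the codebase.

```python
from rstat_tool.ticker_extractor import extract_golden_tickers, extract_potential_tickers

def choose_tickers(title, selftext, comment_bodies):
    """Rough model of the new mention-attribution rules (no DB or sentiment side effects)."""
    # 1. Golden Ticker Discovery: search title + body for '$' tickers.
    golden = extract_golden_tickers(title + " " + selftext)

    if golden:
        # CASE A: '$' tickers exist somewhere in the post; only the ones that also
        # appear in the title drive comment attribution.
        title_tickers = {t for t in golden if t in extract_golden_tickers(title)}
        found = set(golden)
    else:
        # CASE B: no '$' tickers anywhere; fall back to bare all-caps words in the title.
        title_tickers = extract_potential_tickers(title)
        found = set(title_tickers)

    mentions = [(t, "post") for t in title_tickers]
    if title_tickers:
        # Every comment counts as a mention of every title ticker.
        mentions += [(t, "comment") for _ in comment_bodies for t in title_tickers]
    else:
        # No title tickers: each comment is scanned individually for potential tickers.
        for body in comment_bodies:
            in_comment = extract_potential_tickers(body)
            found.update(in_comment)
            mentions += [(t, "comment") for t in in_comment]
    return title_tickers, found, mentions
```

One consequence of the diff worth keeping in mind: if a golden ticker appears only in the post body and not in the title, `tickers_in_title` stays empty, so comments fall back to the potential-ticker scan even though the golden ticker is still returned for the financial-data update.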