Improve ticker discovery ("Golden Ticker" logic): prioritize high-confidence '$'-prefixed tickers, falling back to bare all-caps candidate tickers only when no '$' tickers are present.
This commit is contained in:
@@ -14,7 +14,7 @@ import yfinance as yf
|
||||
import pandas as pd
|
||||
|
||||
from . import database
|
||||
from .ticker_extractor import extract_tickers
|
||||
from .ticker_extractor import extract_golden_tickers, extract_potential_tickers
|
||||
from .sentiment_analyzer import get_sentiment_score
|
||||
from .logger_setup import setup_logging, logger as log
|
||||
|
||||
@@ -65,91 +65,75 @@ def fetch_financial_data(ticker_symbol):
|
||||
|
||||
def _process_submission(submission, subreddit_id, conn, comment_limit):
    """
    Process a single Reddit submission using the "Golden Ticker" logic.

    - Prioritizes tickers with a '$' prefix (the highest-confidence signal).
    - Falls back to potential all-caps tickers only if no '$' tickers exist.

    Args:
        submission: a praw-style submission object (uses .title, .selftext,
            .comments, .id, .created_utc, .permalink) — assumed PRAW; confirm.
        subreddit_id: database id of the subreddit this submission belongs to.
        conn: open database connection, passed through to `database` helpers.
        comment_limit: maximum number of comments to scan for this submission.

    Returns:
        A set of every unique ticker symbol found in the post.  Financial
        data is NOT fetched here; the caller uses the returned set for that.
    """
    # 1. --- Golden Ticker Discovery ---
    # Search the entire post (title and body) for high-confidence '$' tickers.
    post_text_for_discovery = submission.title + " " + submission.selftext
    golden_tickers = extract_golden_tickers(post_text_for_discovery)

    tickers_in_title = set()
    all_tickers_found_in_post = set()

    # 2. --- Apply Contextual Logic ---
    if golden_tickers:
        # --- CASE A: Golden Tickers were found ---
        log.info(f" -> Golden Ticker(s) Found: {', '.join(golden_tickers)}. Prioritizing these.")
        all_tickers_found_in_post.update(golden_tickers)
        # Only the golden tickers that also appear in the title drive the
        # "attribute every comment to the title tickers" behavior below.
        tickers_in_title = {ticker for ticker in golden_tickers if ticker in extract_golden_tickers(submission.title)}
    else:
        # --- CASE B: No Golden Tickers, fall back to best-guess ---
        log.info(" -> No Golden Tickers. Falling back to potential ticker search.")
        # Search for potential tickers (e.g. 'GME' without a '$').
        tickers_in_title = extract_potential_tickers(submission.title)
        all_tickers_found_in_post.update(tickers_in_title)

    # 3. --- Mention Processing ---
    ticker_id_cache = {}
    submission.comments.replace_more(limit=0)
    all_comments = submission.comments.list()[:comment_limit]

    # Score each comment exactly once and reuse the score for both the
    # mention rows and the deep-dive average.  (The previous version called
    # get_sentiment_score twice per comment: once in the mention loop and
    # once more in the deep-dive list comprehension.)
    comment_sentiments = [get_sentiment_score(c.body) for c in all_comments]

    # Process title mentions.
    if tickers_in_title:
        log.info(f" -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments.")
        post_sentiment = get_sentiment_score(submission.title)
        for ticker_symbol in tickers_in_title:
            ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
            ticker_id_cache[ticker_symbol] = ticker_id
            database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'post', int(submission.created_utc), post_sentiment)

    # Process comments.
    for comment, comment_sentiment in zip(all_comments, comment_sentiments):
        if tickers_in_title:
            # If the title has tickers, every comment is a mention for them.
            for ticker_symbol in tickers_in_title:
                ticker_id = ticker_id_cache[ticker_symbol]  # populated above
                database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
        else:
            # No title tickers: scan each comment for potential tickers.
            tickers_in_comment = extract_potential_tickers(comment.body)
            if tickers_in_comment:
                all_tickers_found_in_post.update(tickers_in_comment)
                for ticker_symbol in tickers_in_comment:
                    ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
                    database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)

    # 4. --- Save Deep Dive and Return Tickers for Financial Update ---
    avg_sentiment = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0
    post_analysis_data = {
        "post_id": submission.id,
        "title": submission.title,
        "post_url": f"https://reddit.com{submission.permalink}",
        "subreddit_id": subreddit_id,
        "post_timestamp": int(submission.created_utc),
        "comment_count": len(all_comments),
        "avg_comment_sentiment": avg_sentiment,
    }
    database.add_or_update_post_analysis(conn, post_analysis_data)

    return all_tickers_found_in_post
@@ -119,23 +119,26 @@ COMMON_WORDS_BLACKLIST = {
|
||||
"ZEN", "ZERO", "ZEV"
|
||||
}
|
||||
|
||||
def extract_golden_tickers(text):
    """
    Extract only '$'-prefixed tickers — the highest-confidence signal.

    Returns a set of cleaned symbols (e.g. {'TSLA', 'GME'}).
    """
    # A golden ticker is a literal '$' followed by 1-5 uppercase letters,
    # e.g. $AAPL or $TSLA.
    matches = re.findall(r"\$[A-Z]{1,5}\b", text)
    cleaned = set()
    for match in matches:
        # Strip the leading '$'; upper() is kept for symmetry with the
        # potential-ticker path even though the regex is already uppercase.
        cleaned.add(match.replace("$", "").upper())
    return cleaned
||||
def extract_potential_tickers(text):
    """
    Extract lower-confidence candidate tickers: bare all-caps words.

    Used as a fallback only when no '$'-prefixed ("golden") tickers are
    present.  Returns a set of symbols with common English words removed.
    """
    # Candidates are all-caps words of 2-5 letters, e.g. GME or AMC.
    candidates = re.findall(r"\b[A-Z]{2,5}\b", text)

    # Drop blacklisted common words; building a set also deduplicates.
    result = set()
    for word in candidates:
        if word in COMMON_WORDS_BLACKLIST:
            continue
        result.add(word)
    return result
|
||||
|
Reference in New Issue
Block a user