Refactored scraping logic.

2025-07-31 22:23:52 +02:00
parent 2f43380970
commit 8ebaaf8b36
3 changed files with 138 additions and 191 deletions
--- a/rstat_tool/main.py
+++ b/rstat_tool/main.py
@@ -65,75 +65,117 @@ def fetch_financial_data(ticker_symbol):

 def _process_submission(submission, subreddit_id, conn, comment_limit):
    """
-    Processes a single Reddit submission using the "Golden Ticker" logic.
-    - Prioritizes tickers with a '$' prefix.
-    - Falls back to potential tickers only if no '$' tickers are found.
+    Processes a single Reddit submission with a more precise "Golden Ticker" logic.
+    - If a '$' ticker exists in the title or selftext, the entire submission is in "Golden Only" mode.
+    - Falls back to potential tickers only if no '$' tickers are found in the title or selftext.
    """
-    # 1. --- Golden Ticker Discovery ---
-    # First, search the entire post (title and body) for high-confidence '$' tickers.
+    # 1. --- Establish Mode: Golden or Potential ---
+    # Scan the entire submission (title + selftext) to determine the mode.
    post_text_for_discovery = submission.title + " " + submission.selftext
-    golden_tickers = extract_golden_tickers(post_text_for_discovery)
-    
-    tickers_in_title = set()
-    comment_only_tickers = set()
-    all_tickers_found_in_post = set()
+    golden_tickers_in_post = extract_golden_tickers(post_text_for_discovery)

-    # 2. --- Apply Contextual Logic ---
-    if golden_tickers:
-        # --- CASE A: Golden Tickers were found ---
-        log.info(f"  -> Golden Ticker(s) Found: {', '.join(golden_tickers)}. Prioritizing these.")
-        all_tickers_found_in_post.update(golden_tickers)
-        # We only care about which of the golden tickers appeared in the title for the hybrid logic.
-        tickers_in_title = {ticker for ticker in golden_tickers if ticker in extract_golden_tickers(submission.title)}
+    is_golden_mode = bool(golden_tickers_in_post)
+
+    if is_golden_mode:
+        log.info(
+            f"  -> Golden Ticker(s) Found: {', '.join(golden_tickers_in_post)}. Engaging Golden-Only Mode."
+        )
+        # In Golden Mode, we ONLY care about tickers with a '$'.
+        tickers_in_title = extract_golden_tickers(submission.title)
    else:
-        # --- CASE B: No Golden Tickers, fall back to best-guess ---
        log.info("  -> No Golden Tickers. Falling back to potential ticker search.")
-        # Now we search for potential tickers (e.g., 'GME' without a '$')
+        # In Potential Mode, we look for any valid-looking capitalized word.
        tickers_in_title = extract_potential_tickers(submission.title)
-        all_tickers_found_in_post.update(tickers_in_title)

-    # 3. --- Mention Processing (This logic remains the same, but uses our cleanly identified tickers) ---
+    all_tickers_found_in_post = set(tickers_in_title)
    ticker_id_cache = {}
+
+    # 2. --- Process Title Mentions ---
+    if tickers_in_title:
+        log.info(
+            f"  -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments."
+        )
+        post_sentiment = get_sentiment_score(submission.title)
+        for ticker_symbol in tickers_in_title:
+            # All title tickers are saved as 'post' type mentions
+            ticker_id = database.get_or_create_entity(
+                conn, "tickers", "symbol", ticker_symbol
+            )
+            ticker_id_cache[ticker_symbol] = ticker_id
+            database.add_mention(
+                conn,
+                ticker_id,
+                subreddit_id,
+                submission.id,
+                "post",
+                int(submission.created_utc),
+                post_sentiment,
+            )
+
+    # 3. --- Process Comments (Single, Efficient Loop) ---
    submission.comments.replace_more(limit=0)
    all_comments = submission.comments.list()[:comment_limit]

-    # Process title mentions
-    if tickers_in_title:
-        log.info(f"  -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments.")
-        post_sentiment = get_sentiment_score(submission.title)
-        for ticker_symbol in tickers_in_title:
-            ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
-            ticker_id_cache[ticker_symbol] = ticker_id
-            database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'post', int(submission.created_utc), post_sentiment)
-
-    # Process comments
    for comment in all_comments:
        comment_sentiment = get_sentiment_score(comment.body)
+
        if tickers_in_title:
+            # If the title had tickers, every comment is a mention for them.
+            # We don't need to scan the comment text for tickers here.
            for ticker_symbol in tickers_in_title:
-                ticker_id = ticker_id_cache[ticker_symbol]
-                database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
+                ticker_id = ticker_id_cache[ticker_symbol]  # Guaranteed to be in cache
+                database.add_mention(
+                    conn,
+                    ticker_id,
+                    subreddit_id,
+                    submission.id,
+                    "comment",
+                    int(comment.created_utc),
+                    comment_sentiment,
+                )
        else:
-            # If no title tickers, we must scan comments for potential tickers
-            tickers_in_comment = extract_potential_tickers(comment.body)
+            # If no title tickers, we must scan the comment for direct mentions.
+            # The type of ticker we look for depends on the mode.
+            if is_golden_mode:
+                # Golden mode with no '$' ticker in the title means it came from the selftext.
+                tickers_in_comment = extract_golden_tickers(comment.body)
+            else:
+                tickers_in_comment = extract_potential_tickers(comment.body)
+
            if tickers_in_comment:
                all_tickers_found_in_post.update(tickers_in_comment)
                for ticker_symbol in tickers_in_comment:
-                    ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
-                    database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
+                    ticker_id = database.get_or_create_entity(
+                        conn, "tickers", "symbol", ticker_symbol
+                    )
+                    database.add_mention(
+                        conn,
+                        ticker_id,
+                        subreddit_id,
+                        submission.id,
+                        "comment",
+                        int(comment.created_utc),
+                        comment_sentiment,
+                    )

-    # 4. --- Save Deep Dive and Return Tickers for Financial Update ---
-    # (This part is unchanged)
+    # 4. --- Save Deep Dive Analysis ---
    all_comment_sentiments = [get_sentiment_score(c.body) for c in all_comments]
-    avg_sentiment = sum(all_comment_sentiments) / len(all_comment_sentiments) if all_comment_sentiments else 0
+    avg_sentiment = (
+        sum(all_comment_sentiments) / len(all_comment_sentiments)
+        if all_comment_sentiments
+        else 0
+    )
    post_analysis_data = {
-        "post_id": submission.id, "title": submission.title,
-        "post_url": f"https://reddit.com{submission.permalink}", "subreddit_id": subreddit_id,
-        "post_timestamp": int(submission.created_utc), "comment_count": len(all_comments),
-        "avg_comment_sentiment": avg_sentiment
+        "post_id": submission.id,
+        "title": submission.title,
+        "post_url": f"https://reddit.com{submission.permalink}",
+        "subreddit_id": subreddit_id,
+        "post_timestamp": int(submission.created_utc),
+        "comment_count": len(all_comments),
+        "avg_comment_sentiment": avg_sentiment,
    }
    database.add_or_update_post_analysis(conn, post_analysis_data)
-    
+
    return all_tickers_found_in_post