From 8ebaaf8b36c36f183af7dd9b387b53773d539d71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A5l-Kristian=20Hamre?= <pkhamre@gmail.com>
Date: Thu, 31 Jul 2025 22:23:52 +0200
Subject: [PATCH] Refactored scraping logic and consolidated database summary queries.

---
 rstat_tool/dashboard.py |   4 +-
 rstat_tool/database.py  | 193 ++++++++++------------------------------
 rstat_tool/main.py      | 132 +++++++++++++++++----------
 3 files changed, 138 insertions(+), 191 deletions(-)

diff --git a/rstat_tool/dashboard.py b/rstat_tool/dashboard.py
index 39abb42..1241d22 100644
--- a/rstat_tool/dashboard.py
+++ b/rstat_tool/dashboard.py
@@ -8,8 +8,8 @@ from .database import (
     get_deep_dive_details,
     get_daily_summary_for_subreddit,
     get_weekly_summary_for_subreddit,
-    get_overall_daily_summary,  # Now correctly imported
-    get_overall_weekly_summary,  # Now correctly imported
+    get_overall_daily_summary,
+    get_overall_weekly_summary,
 )
 
 app = Flask(__name__, template_folder='../templates', static_folder='../static')
diff --git a/rstat_tool/database.py b/rstat_tool/database.py
index bd4d126..9df9d25 100644
--- a/rstat_tool/database.py
+++ b/rstat_tool/database.py
@@ -2,7 +2,7 @@
 
 import sqlite3
 import time
-from .ticker_extractor import COMMON_WORDS_BLACKLIST
+from .ticker_extractor import COMMON_WORDS_BLACKLIST, extract_golden_tickers, extract_potential_tickers
 from .logger_setup import logger as log
 from datetime import datetime, timedelta, timezone
 
@@ -203,23 +203,6 @@ def get_ticker_info(conn, ticker_id):
     return cursor.fetchone()
 
 
-def get_week_start_end(for_date):
-    """
-    Calculates the start (Monday, 00:00:00) and end (Sunday, 23:59:59)
-    of the week that a given date falls into.
-    Returns two datetime objects.
-    """
-    # Monday is 0, Sunday is 6
-    start_of_week = for_date - timedelta(days=for_date.weekday())
-    end_of_week = start_of_week + timedelta(days=6)
-
-    # Set time to the very beginning and very end of the day for an inclusive range
-    start_of_week = start_of_week.replace(hour=0, minute=0, second=0, microsecond=0)
-    end_of_week = end_of_week.replace(hour=23, minute=59, second=59, microsecond=999999)
-
-    return start_of_week, end_of_week
-
-
 def add_or_update_post_analysis(conn, post_data):
     """
     Inserts a new post analysis record or updates an existing one.
@@ -240,127 +223,16 @@ def add_or_update_post_analysis(conn, post_data):
     conn.commit()
 
 
-def get_overall_summary(limit=10):
-    """
-    Gets the top tickers across all subreddits from the LAST 24 HOURS.
-    """
-    conn = get_db_connection()
-    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
-    one_day_ago_timestamp = int(one_day_ago.timestamp())
-
-    query = """
-        SELECT t.symbol, t.market_cap, t.closing_price, COUNT(m.id) as mention_count,
-            SUM(CASE WHEN m.mention_sentiment > 0.1 THEN 1 ELSE 0 END) as bullish_mentions,
-            SUM(CASE WHEN m.mention_sentiment < -0.1 THEN 1 ELSE 0 END) as bearish_mentions,
-            SUM(CASE WHEN m.mention_sentiment BETWEEN -0.1 AND 0.1 THEN 1 ELSE 0 END) as neutral_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id
-        WHERE m.mention_timestamp >= ? -- <-- ADDED TIME FILTER
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY mention_count DESC LIMIT ?;
-    """
-    results = conn.execute(query, (one_day_ago_timestamp, limit)).fetchall()
-    conn.close()
-    return results
-
-
-def get_subreddit_summary(subreddit_name, limit=10):
-    """
-    Gets the top tickers for a specific subreddit from the LAST 24 HOURS.
-    """
-    conn = get_db_connection()
-    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
-    one_day_ago_timestamp = int(one_day_ago.timestamp())
-
-    query = """
-        SELECT t.symbol, t.market_cap, t.closing_price, COUNT(m.id) as mention_count,
-            SUM(CASE WHEN m.mention_sentiment > 0.1 THEN 1 ELSE 0 END) as bullish_mentions,
-            SUM(CASE WHEN m.mention_sentiment < -0.1 THEN 1 ELSE 0 END) as bearish_mentions,
-            SUM(CASE WHEN m.mention_sentiment BETWEEN -0.1 AND 0.1 THEN 1 ELSE 0 END) as neutral_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
-        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp >= ? -- <-- ADDED TIME FILTER
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY mention_count DESC LIMIT ?;
-    """
-    results = conn.execute(
-        query, (subreddit_name, one_day_ago_timestamp, limit)
-    ).fetchall()
-    conn.close()
-    return results
-
-
-def get_daily_summary_for_subreddit(subreddit_name):
-    """Gets a summary for the DAILY image view (last 24 hours)."""
-    conn = get_db_connection()
-    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
-    one_day_ago_timestamp = int(one_day_ago.timestamp())
-    query = """
-        SELECT
-            t.symbol, t.market_cap, t.closing_price,
-            COUNT(m.id) as total_mentions,
-            COUNT(CASE WHEN m.mention_sentiment > 0.1 THEN 1 END) as bullish_mentions,
-            COUNT(CASE WHEN m.mention_sentiment < -0.1 THEN 1 END) as bearish_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
-        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp >= ?
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY total_mentions DESC LIMIT 10;
-    """
-    results = conn.execute(query, (subreddit_name, one_day_ago_timestamp)).fetchall()
-    conn.close()
-    return results
-
-
-def get_weekly_summary_for_subreddit(subreddit_name, for_date):
-    """Gets a summary for the WEEKLY image view (full week)."""
-    conn = get_db_connection()
-    start_of_week, end_of_week = get_week_start_end(for_date)
-    start_timestamp = int(start_of_week.timestamp())
-    end_timestamp = int(end_of_week.timestamp())
-    query = """
-        SELECT
-            t.symbol, t.market_cap, t.closing_price,
-            COUNT(m.id) as total_mentions,
-            COUNT(CASE WHEN m.mention_sentiment > 0.1 THEN 1 END) as bullish_mentions,
-            COUNT(CASE WHEN m.mention_sentiment < -0.1 THEN 1 END) as bearish_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
-        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp BETWEEN ? AND ?
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY total_mentions DESC LIMIT 10;
-    """
-    results = conn.execute(
-        query, (subreddit_name, start_timestamp, end_timestamp)
-    ).fetchall()
-    conn.close()
-    return results, start_of_week, end_of_week
-
-
-def get_overall_image_view_summary():
-    """
-    Gets a summary of top tickers across ALL subreddits for the DAILY image view (last 24 hours).
-    """
-    conn = get_db_connection()
-    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
-    one_day_ago_timestamp = int(one_day_ago.timestamp())
-    query = """
-        SELECT
-            t.symbol, t.market_cap, t.closing_price,
-            COUNT(m.id) as total_mentions,
-            COUNT(CASE WHEN m.mention_sentiment > 0.1 THEN 1 END) as bullish_mentions,
-            COUNT(CASE WHEN m.mention_sentiment < -0.1 THEN 1 END) as bearish_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id
-        WHERE m.mention_timestamp >= ? -- <-- ADDED TIME FILTER
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY total_mentions DESC LIMIT 10;
-    """
-    results = conn.execute(query, (one_day_ago_timestamp,)).fetchall()
-    conn.close()
-    return results
-
+def get_week_start_end(for_date):
+    """Calculates the start (Monday) and end (Sunday) of the week."""
+    start_of_week = for_date - timedelta(days=for_date.weekday())
+    end_of_week = start_of_week + timedelta(days=6)
+    start_of_week = start_of_week.replace(hour=0, minute=0, second=0, microsecond=0)
+    end_of_week = end_of_week.replace(hour=23, minute=59, second=59, microsecond=999999)
+    return start_of_week, end_of_week
 
 def get_overall_daily_summary():
-    """
-    Gets the top tickers across all subreddits from the LAST 24 HOURS.
-    (This is a copy of get_overall_summary, renamed for clarity).
-    """
+    """Gets the top tickers across all subreddits from the LAST 24 HOURS."""
     conn = get_db_connection()
     one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
     one_day_ago_timestamp = int(one_day_ago.timestamp())
@@ -377,16 +249,12 @@ def get_overall_daily_summary():
     conn.close()
     return results
 
-
 def get_overall_weekly_summary():
-    """
-    Gets the top tickers across all subreddits for the LAST 7 DAYS.
-    """
+    """Gets the top tickers across all subreddits for LAST WEEK (Mon-Sun)."""
     conn = get_db_connection()
     today = datetime.now(timezone.utc)
-    start_of_week, end_of_week = get_week_start_end(
-        today - timedelta(days=7)
-    )  # Get last week's boundaries
+    target_date_for_last_week = today - timedelta(days=7)
+    start_of_week, end_of_week = get_week_start_end(target_date_for_last_week)
     start_timestamp = int(start_of_week.timestamp())
     end_timestamp = int(end_of_week.timestamp())
     query = """
@@ -402,6 +270,43 @@ def get_overall_weekly_summary():
     conn.close()
     return results, start_of_week, end_of_week
 
+def get_daily_summary_for_subreddit(subreddit_name):
+    """Gets a summary for a subreddit's DAILY view (last 24 hours)."""
+    conn = get_db_connection()
+    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
+    one_day_ago_timestamp = int(one_day_ago.timestamp())
+    query = """
+        SELECT t.symbol, t.market_cap, t.closing_price, COUNT(m.id) as total_mentions,
+            SUM(CASE WHEN m.mention_sentiment > 0.1 THEN 1 ELSE 0 END) as bullish_mentions,
+            SUM(CASE WHEN m.mention_sentiment < -0.1 THEN 1 ELSE 0 END) as bearish_mentions
+        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
+        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp >= ?
+        GROUP BY t.symbol, t.market_cap, t.closing_price
+        ORDER BY total_mentions DESC LIMIT 10;
+    """
+    results = conn.execute(query, (subreddit_name, one_day_ago_timestamp)).fetchall()
+    conn.close()
+    return results
+
+def get_weekly_summary_for_subreddit(subreddit_name, for_date):
+    """Gets a summary for a subreddit's WEEKLY view (for a specific week)."""
+    conn = get_db_connection()
+    start_of_week, end_of_week = get_week_start_end(for_date)
+    start_timestamp = int(start_of_week.timestamp())
+    end_timestamp = int(end_of_week.timestamp())
+    query = """
+        SELECT t.symbol, t.market_cap, t.closing_price, COUNT(m.id) as total_mentions,
+            SUM(CASE WHEN m.mention_sentiment > 0.1 THEN 1 ELSE 0 END) as bullish_mentions,
+            SUM(CASE WHEN m.mention_sentiment < -0.1 THEN 1 ELSE 0 END) as bearish_mentions
+        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
+        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp BETWEEN ? AND ?
+        GROUP BY t.symbol, t.market_cap, t.closing_price
+        ORDER BY total_mentions DESC LIMIT 10;
+    """
+    results = conn.execute(query, (subreddit_name, start_timestamp, end_timestamp)).fetchall()
+    conn.close()
+    return results, start_of_week, end_of_week
+
 
 def get_deep_dive_details(ticker_symbol):
     """Gets all analyzed posts that mention a specific ticker."""
diff --git a/rstat_tool/main.py b/rstat_tool/main.py
index 5437030..ed969da 100644
--- a/rstat_tool/main.py
+++ b/rstat_tool/main.py
@@ -65,75 +65,117 @@ def fetch_financial_data(ticker_symbol):
 
 def _process_submission(submission, subreddit_id, conn, comment_limit):
     """
-    Processes a single Reddit submission using the "Golden Ticker" logic.
-    - Prioritizes tickers with a '$' prefix.
-    - Falls back to potential tickers only if no '$' tickers are found.
+    Processes a single Reddit submission with a more precise "Golden Ticker" logic.
+    - If a '$' ticker exists in the title or selftext, the entire submission is in "Golden Only" mode.
+    - Falls back to potential tickers only if no '$' tickers are found in the title or selftext.
     """
-    # 1. --- Golden Ticker Discovery ---
-    # First, search the entire post (title and body) for high-confidence '$' tickers.
+    # 1. --- Establish Mode: Golden or Potential ---
+    # Scan the entire submission (title + selftext) to determine the mode.
     post_text_for_discovery = submission.title + " " + submission.selftext
-    golden_tickers = extract_golden_tickers(post_text_for_discovery)
-    
-    tickers_in_title = set()
-    comment_only_tickers = set()
-    all_tickers_found_in_post = set()
+    golden_tickers_in_post = extract_golden_tickers(post_text_for_discovery)
 
-    # 2. --- Apply Contextual Logic ---
-    if golden_tickers:
-        # --- CASE A: Golden Tickers were found ---
-        log.info(f"  -> Golden Ticker(s) Found: {', '.join(golden_tickers)}. Prioritizing these.")
-        all_tickers_found_in_post.update(golden_tickers)
-        # We only care about which of the golden tickers appeared in the title for the hybrid logic.
-        tickers_in_title = {ticker for ticker in golden_tickers if ticker in extract_golden_tickers(submission.title)}
+    is_golden_mode = bool(golden_tickers_in_post)
+
+    if is_golden_mode:
+        log.info(
+            f"  -> Golden Ticker(s) Found: {', '.join(golden_tickers_in_post)}. Engaging Golden-Only Mode."
+        )
+        # In Golden Mode, we ONLY care about tickers with a '$'.
+        tickers_in_title = extract_golden_tickers(submission.title)
     else:
-        # --- CASE B: No Golden Tickers, fall back to best-guess ---
         log.info("  -> No Golden Tickers. Falling back to potential ticker search.")
-        # Now we search for potential tickers (e.g., 'GME' without a '$')
+        # In Potential Mode, we look for any valid-looking capitalized word.
         tickers_in_title = extract_potential_tickers(submission.title)
-        all_tickers_found_in_post.update(tickers_in_title)
 
-    # 3. --- Mention Processing (This logic remains the same, but uses our cleanly identified tickers) ---
+    all_tickers_found_in_post = set(tickers_in_title)
     ticker_id_cache = {}
+
+    # 2. --- Process Title Mentions ---
+    if tickers_in_title:
+        log.info(
+            f"  -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments."
+        )
+        post_sentiment = get_sentiment_score(submission.title)
+        for ticker_symbol in tickers_in_title:
+            # All title tickers are saved as 'post' type mentions
+            ticker_id = database.get_or_create_entity(
+                conn, "tickers", "symbol", ticker_symbol
+            )
+            ticker_id_cache[ticker_symbol] = ticker_id
+            database.add_mention(
+                conn,
+                ticker_id,
+                subreddit_id,
+                submission.id,
+                "post",
+                int(submission.created_utc),
+                post_sentiment,
+            )
+
+    # 3. --- Process Comments (Single, Efficient Loop) ---
     submission.comments.replace_more(limit=0)
     all_comments = submission.comments.list()[:comment_limit]
 
-    # Process title mentions
-    if tickers_in_title:
-        log.info(f"  -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments.")
-        post_sentiment = get_sentiment_score(submission.title)
-        for ticker_symbol in tickers_in_title:
-            ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
-            ticker_id_cache[ticker_symbol] = ticker_id
-            database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'post', int(submission.created_utc), post_sentiment)
-
-    # Process comments
     for comment in all_comments:
         comment_sentiment = get_sentiment_score(comment.body)
+
         if tickers_in_title:
+            # If the title had tickers, every comment is a mention for them.
+            # We don't need to scan the comment text for tickers here.
             for ticker_symbol in tickers_in_title:
-                ticker_id = ticker_id_cache[ticker_symbol]
-                database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
+                ticker_id = ticker_id_cache[ticker_symbol]  # Guaranteed to be in cache
+                database.add_mention(
+                    conn,
+                    ticker_id,
+                    subreddit_id,
+                    submission.id,
+                    "comment",
+                    int(comment.created_utc),
+                    comment_sentiment,
+                )
         else:
-            # If no title tickers, we must scan comments for potential tickers
-            tickers_in_comment = extract_potential_tickers(comment.body)
+            # If no title tickers, we must scan the comment for direct mentions.
+            # The type of ticker we look for depends on the mode.
+            if is_golden_mode:
+                # This case is rare (no golden in title, but some in the selftext) but important.
+                tickers_in_comment = extract_golden_tickers(comment.body)
+            else:
+                tickers_in_comment = extract_potential_tickers(comment.body)
+
             if tickers_in_comment:
                 all_tickers_found_in_post.update(tickers_in_comment)
                 for ticker_symbol in tickers_in_comment:
-                    ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
-                    database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
+                    ticker_id = database.get_or_create_entity(
+                        conn, "tickers", "symbol", ticker_symbol
+                    )
+                    database.add_mention(
+                        conn,
+                        ticker_id,
+                        subreddit_id,
+                        submission.id,
+                        "comment",
+                        int(comment.created_utc),
+                        comment_sentiment,
+                    )
 
-    # 4. --- Save Deep Dive and Return Tickers for Financial Update ---
-    # (This part is unchanged)
+    # 4. --- Save Deep Dive Analysis ---
     all_comment_sentiments = [get_sentiment_score(c.body) for c in all_comments]
-    avg_sentiment = sum(all_comment_sentiments) / len(all_comment_sentiments) if all_comment_sentiments else 0
+    avg_sentiment = (
+        sum(all_comment_sentiments) / len(all_comment_sentiments)
+        if all_comment_sentiments
+        else 0
+    )
     post_analysis_data = {
-        "post_id": submission.id, "title": submission.title,
-        "post_url": f"https://reddit.com{submission.permalink}", "subreddit_id": subreddit_id,
-        "post_timestamp": int(submission.created_utc), "comment_count": len(all_comments),
-        "avg_comment_sentiment": avg_sentiment
+        "post_id": submission.id,
+        "title": submission.title,
+        "post_url": f"https://reddit.com{submission.permalink}",
+        "subreddit_id": subreddit_id,
+        "post_timestamp": int(submission.created_utc),
+        "comment_count": len(all_comments),
+        "avg_comment_sentiment": avg_sentiment,
     }
     database.add_or_update_post_analysis(conn, post_analysis_data)
-    
+
     return all_tickers_found_in_post