Refactored scraping logic.

commit 8ebaaf8b36
parent 2f43380970
2025-07-31 22:23:52 +02:00
3 changed files with 138 additions and 191 deletions
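
For context, the changes below lean on two helpers from ticker_extractor.py that this commit starts importing but whose bodies are not part of the diff. A minimal sketch of their assumed behavior (the regexes and blacklist contents are illustrative guesses, not the project's actual implementation):

import re

# Assumed shape of the blacklist; the real one lives in ticker_extractor.py.
COMMON_WORDS_BLACKLIST = {"CEO", "USA", "IMO", "YOLO"}

def extract_golden_tickers(text):
    # High-confidence "golden" tickers: 1-5 letters prefixed with '$'.
    return {m.upper() for m in re.findall(r"\$([A-Za-z]{1,5})\b", text)} - COMMON_WORDS_BLACKLIST

def extract_potential_tickers(text):
    # Best-guess tickers: bare all-caps words, filtered through the blacklist.
    return set(re.findall(r"\b[A-Z]{2,5}\b", text)) - COMMON_WORDS_BLACKLIST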

View File

@@ -8,8 +8,8 @@ from .database import (
     get_deep_dive_details,
     get_daily_summary_for_subreddit,
     get_weekly_summary_for_subreddit,
-    get_overall_daily_summary, # Now correctly imported
-    get_overall_weekly_summary, # Now correctly imported
+    get_overall_daily_summary,
+    get_overall_weekly_summary,
 )

 app = Flask(__name__, template_folder='../templates', static_folder='../static')

View File

@@ -2,7 +2,7 @@
 import sqlite3
 import time
-from .ticker_extractor import COMMON_WORDS_BLACKLIST
+from .ticker_extractor import COMMON_WORDS_BLACKLIST, extract_golden_tickers, extract_potential_tickers
 from .logger_setup import logger as log
 from datetime import datetime, timedelta, timezone
@@ -203,23 +203,6 @@ def get_ticker_info(conn, ticker_id):
     return cursor.fetchone()

-def get_week_start_end(for_date):
-    """
-    Calculates the start (Monday, 00:00:00) and end (Sunday, 23:59:59)
-    of the week that a given date falls into.
-    Returns two datetime objects.
-    """
-    # Monday is 0, Sunday is 6
-    start_of_week = for_date - timedelta(days=for_date.weekday())
-    end_of_week = start_of_week + timedelta(days=6)
-    # Set time to the very beginning and very end of the day for an inclusive range
-    start_of_week = start_of_week.replace(hour=0, minute=0, second=0, microsecond=0)
-    end_of_week = end_of_week.replace(hour=23, minute=59, second=59, microsecond=999999)
-    return start_of_week, end_of_week
-
 def add_or_update_post_analysis(conn, post_data):
     """
     Inserts a new post analysis record or updates an existing one.
@@ -240,127 +223,16 @@ def add_or_update_post_analysis(conn, post_data):
     conn.commit()

-def get_overall_summary(limit=10):
-    """
-    Gets the top tickers across all subreddits from the LAST 24 HOURS.
-    """
-    conn = get_db_connection()
-    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
-    one_day_ago_timestamp = int(one_day_ago.timestamp())
-    query = """
-        SELECT t.symbol, t.market_cap, t.closing_price, COUNT(m.id) as mention_count,
-            SUM(CASE WHEN m.mention_sentiment > 0.1 THEN 1 ELSE 0 END) as bullish_mentions,
-            SUM(CASE WHEN m.mention_sentiment < -0.1 THEN 1 ELSE 0 END) as bearish_mentions,
-            SUM(CASE WHEN m.mention_sentiment BETWEEN -0.1 AND 0.1 THEN 1 ELSE 0 END) as neutral_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id
-        WHERE m.mention_timestamp >= ? -- <-- ADDED TIME FILTER
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY mention_count DESC LIMIT ?;
-    """
-    results = conn.execute(query, (one_day_ago_timestamp, limit)).fetchall()
-    conn.close()
-    return results
-
-def get_subreddit_summary(subreddit_name, limit=10):
-    """
-    Gets the top tickers for a specific subreddit from the LAST 24 HOURS.
-    """
-    conn = get_db_connection()
-    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
-    one_day_ago_timestamp = int(one_day_ago.timestamp())
-    query = """
-        SELECT t.symbol, t.market_cap, t.closing_price, COUNT(m.id) as mention_count,
-            SUM(CASE WHEN m.mention_sentiment > 0.1 THEN 1 ELSE 0 END) as bullish_mentions,
-            SUM(CASE WHEN m.mention_sentiment < -0.1 THEN 1 ELSE 0 END) as bearish_mentions,
-            SUM(CASE WHEN m.mention_sentiment BETWEEN -0.1 AND 0.1 THEN 1 ELSE 0 END) as neutral_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
-        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp >= ? -- <-- ADDED TIME FILTER
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY mention_count DESC LIMIT ?;
-    """
-    results = conn.execute(
-        query, (subreddit_name, one_day_ago_timestamp, limit)
-    ).fetchall()
-    conn.close()
-    return results
-
-def get_daily_summary_for_subreddit(subreddit_name):
-    """Gets a summary for the DAILY image view (last 24 hours)."""
-    conn = get_db_connection()
-    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
-    one_day_ago_timestamp = int(one_day_ago.timestamp())
-    query = """
-        SELECT
-            t.symbol, t.market_cap, t.closing_price,
-            COUNT(m.id) as total_mentions,
-            COUNT(CASE WHEN m.mention_sentiment > 0.1 THEN 1 END) as bullish_mentions,
-            COUNT(CASE WHEN m.mention_sentiment < -0.1 THEN 1 END) as bearish_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
-        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp >= ?
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY total_mentions DESC LIMIT 10;
-    """
-    results = conn.execute(query, (subreddit_name, one_day_ago_timestamp)).fetchall()
-    conn.close()
-    return results
-
-def get_weekly_summary_for_subreddit(subreddit_name, for_date):
-    """Gets a summary for the WEEKLY image view (full week)."""
-    conn = get_db_connection()
-    start_of_week, end_of_week = get_week_start_end(for_date)
-    start_timestamp = int(start_of_week.timestamp())
-    end_timestamp = int(end_of_week.timestamp())
-    query = """
-        SELECT
-            t.symbol, t.market_cap, t.closing_price,
-            COUNT(m.id) as total_mentions,
-            COUNT(CASE WHEN m.mention_sentiment > 0.1 THEN 1 END) as bullish_mentions,
-            COUNT(CASE WHEN m.mention_sentiment < -0.1 THEN 1 END) as bearish_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
-        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp BETWEEN ? AND ?
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY total_mentions DESC LIMIT 10;
-    """
-    results = conn.execute(
-        query, (subreddit_name, start_timestamp, end_timestamp)
-    ).fetchall()
-    conn.close()
-    return results, start_of_week, end_of_week
-
-def get_overall_image_view_summary():
-    """
-    Gets a summary of top tickers across ALL subreddits for the DAILY image view (last 24 hours).
-    """
-    conn = get_db_connection()
-    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
-    one_day_ago_timestamp = int(one_day_ago.timestamp())
-    query = """
-        SELECT
-            t.symbol, t.market_cap, t.closing_price,
-            COUNT(m.id) as total_mentions,
-            COUNT(CASE WHEN m.mention_sentiment > 0.1 THEN 1 END) as bullish_mentions,
-            COUNT(CASE WHEN m.mention_sentiment < -0.1 THEN 1 END) as bearish_mentions
-        FROM mentions m JOIN tickers t ON m.ticker_id = t.id
-        WHERE m.mention_timestamp >= ? -- <-- ADDED TIME FILTER
-        GROUP BY t.symbol, t.market_cap, t.closing_price
-        ORDER BY total_mentions DESC LIMIT 10;
-    """
-    results = conn.execute(query, (one_day_ago_timestamp,)).fetchall()
-    conn.close()
-    return results
+def get_week_start_end(for_date):
+    """Calculates the start (Monday) and end (Sunday) of the week."""
+    start_of_week = for_date - timedelta(days=for_date.weekday())
+    end_of_week = start_of_week + timedelta(days=6)
+    start_of_week = start_of_week.replace(hour=0, minute=0, second=0, microsecond=0)
+    end_of_week = end_of_week.replace(hour=23, minute=59, second=59, microsecond=999999)
+    return start_of_week, end_of_week

 def get_overall_daily_summary():
-    """
-    Gets the top tickers across all subreddits from the LAST 24 HOURS.
-    (This is a copy of get_overall_summary, renamed for clarity).
-    """
+    """Gets the top tickers across all subreddits from the LAST 24 HOURS."""
     conn = get_db_connection()
     one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
     one_day_ago_timestamp = int(one_day_ago.timestamp())
@@ -377,16 +249,12 @@ def get_overall_daily_summary():
     conn.close()
     return results

 def get_overall_weekly_summary():
-    """
-    Gets the top tickers across all subreddits for the LAST 7 DAYS.
-    """
+    """Gets the top tickers across all subreddits for LAST WEEK (Mon-Sun)."""
     conn = get_db_connection()
     today = datetime.now(timezone.utc)
-    start_of_week, end_of_week = get_week_start_end(
-        today - timedelta(days=7)
-    )  # Get last week's boundaries
+    target_date_for_last_week = today - timedelta(days=7)
+    start_of_week, end_of_week = get_week_start_end(target_date_for_last_week)
     start_timestamp = int(start_of_week.timestamp())
     end_timestamp = int(end_of_week.timestamp())
     query = """
@@ -402,6 +270,43 @@ def get_overall_weekly_summary():
     conn.close()
     return results, start_of_week, end_of_week

+def get_daily_summary_for_subreddit(subreddit_name):
+    """Gets a summary for a subreddit's DAILY view (last 24 hours)."""
+    conn = get_db_connection()
+    one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
+    one_day_ago_timestamp = int(one_day_ago.timestamp())
+    query = """
+        SELECT t.symbol, t.market_cap, t.closing_price, COUNT(m.id) as total_mentions,
+            SUM(CASE WHEN m.mention_sentiment > 0.1 THEN 1 ELSE 0 END) as bullish_mentions,
+            SUM(CASE WHEN m.mention_sentiment < -0.1 THEN 1 ELSE 0 END) as bearish_mentions
+        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
+        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp >= ?
+        GROUP BY t.symbol, t.market_cap, t.closing_price
+        ORDER BY total_mentions DESC LIMIT 10;
+    """
+    results = conn.execute(query, (subreddit_name, one_day_ago_timestamp)).fetchall()
+    conn.close()
+    return results
+
+def get_weekly_summary_for_subreddit(subreddit_name, for_date):
+    """Gets a summary for a subreddit's WEEKLY view (for a specific week)."""
+    conn = get_db_connection()
+    start_of_week, end_of_week = get_week_start_end(for_date)
+    start_timestamp = int(start_of_week.timestamp())
+    end_timestamp = int(end_of_week.timestamp())
+    query = """
+        SELECT t.symbol, t.market_cap, t.closing_price, COUNT(m.id) as total_mentions,
+            SUM(CASE WHEN m.mention_sentiment > 0.1 THEN 1 ELSE 0 END) as bullish_mentions,
+            SUM(CASE WHEN m.mention_sentiment < -0.1 THEN 1 ELSE 0 END) as bearish_mentions
+        FROM mentions m JOIN tickers t ON m.ticker_id = t.id JOIN subreddits s ON m.subreddit_id = s.id
+        WHERE LOWER(s.name) = LOWER(?) AND m.mention_timestamp BETWEEN ? AND ?
+        GROUP BY t.symbol, t.market_cap, t.closing_price
+        ORDER BY total_mentions DESC LIMIT 10;
+    """
+    results = conn.execute(query, (subreddit_name, start_timestamp, end_timestamp)).fetchall()
+    conn.close()
+    return results, start_of_week, end_of_week
+
 def get_deep_dive_details(ticker_symbol):
     """Gets all analyzed posts that mention a specific ticker."""

View File

@@ -65,72 +65,114 @@ def fetch_financial_data(ticker_symbol):
 def _process_submission(submission, subreddit_id, conn, comment_limit):
     """
-    Processes a single Reddit submission using the "Golden Ticker" logic.
-    - Prioritizes tickers with a '$' prefix.
-    - Falls back to potential tickers only if no '$' tickers are found.
+    Processes a single Reddit submission with a more precise "Golden Ticker" logic.
+    - If a '$' ticker exists anywhere, the entire submission is in "Golden Only" mode.
+    - Falls back to potential tickers only if no '$' tickers are found anywhere.
     """
-    # 1. --- Golden Ticker Discovery ---
-    # First, search the entire post (title and body) for high-confidence '$' tickers.
+    # 1. --- Establish Mode: Golden or Potential ---
+    # Scan the entire submission (title + selftext) to determine the mode.
     post_text_for_discovery = submission.title + " " + submission.selftext
-    golden_tickers = extract_golden_tickers(post_text_for_discovery)
-    tickers_in_title = set()
-    comment_only_tickers = set()
-    all_tickers_found_in_post = set()
+    golden_tickers_in_post = extract_golden_tickers(post_text_for_discovery)
+    is_golden_mode = bool(golden_tickers_in_post)

-    # 2. --- Apply Contextual Logic ---
-    if golden_tickers:
-        # --- CASE A: Golden Tickers were found ---
-        log.info(f" -> Golden Ticker(s) Found: {', '.join(golden_tickers)}. Prioritizing these.")
-        all_tickers_found_in_post.update(golden_tickers)
-        # We only care about which of the golden tickers appeared in the title for the hybrid logic.
-        tickers_in_title = {ticker for ticker in golden_tickers if ticker in extract_golden_tickers(submission.title)}
+    if is_golden_mode:
+        log.info(
+            f" -> Golden Ticker(s) Found: {', '.join(golden_tickers_in_post)}. Engaging Golden-Only Mode."
+        )
+        # In Golden Mode, we ONLY care about tickers with a '$'.
+        tickers_in_title = extract_golden_tickers(submission.title)
     else:
-        # --- CASE B: No Golden Tickers, fall back to best-guess ---
         log.info(" -> No Golden Tickers. Falling back to potential ticker search.")
-        # Now we search for potential tickers (e.g., 'GME' without a '$')
+        # In Potential Mode, we look for any valid-looking capitalized word.
         tickers_in_title = extract_potential_tickers(submission.title)
-        all_tickers_found_in_post.update(tickers_in_title)

-    # 3. --- Mention Processing (This logic remains the same, but uses our cleanly identified tickers) ---
+    all_tickers_found_in_post = set(tickers_in_title)
     ticker_id_cache = {}

+    # 2. --- Process Title Mentions ---
+    if tickers_in_title:
+        log.info(
+            f" -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments."
+        )
+        post_sentiment = get_sentiment_score(submission.title)
+        for ticker_symbol in tickers_in_title:
+            # All title tickers are saved as 'post' type mentions
+            ticker_id = database.get_or_create_entity(
+                conn, "tickers", "symbol", ticker_symbol
+            )
+            ticker_id_cache[ticker_symbol] = ticker_id
+            database.add_mention(
+                conn,
+                ticker_id,
+                subreddit_id,
+                submission.id,
+                "post",
+                int(submission.created_utc),
+                post_sentiment,
+            )
+
+    # 3. --- Process Comments (Single, Efficient Loop) ---
     submission.comments.replace_more(limit=0)
     all_comments = submission.comments.list()[:comment_limit]

-    # Process title mentions
-    if tickers_in_title:
-        log.info(f" -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments.")
-        post_sentiment = get_sentiment_score(submission.title)
-        for ticker_symbol in tickers_in_title:
-            ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
-            ticker_id_cache[ticker_symbol] = ticker_id
-            database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'post', int(submission.created_utc), post_sentiment)
-
-    # Process comments
     for comment in all_comments:
         comment_sentiment = get_sentiment_score(comment.body)
         if tickers_in_title:
+            # If the title had tickers, every comment is a mention for them.
+            # We don't need to scan the comment text for tickers here.
             for ticker_symbol in tickers_in_title:
-                ticker_id = ticker_id_cache[ticker_symbol]
-                database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
+                ticker_id = ticker_id_cache[ticker_symbol]  # Guaranteed to be in cache
+                database.add_mention(
+                    conn,
+                    ticker_id,
+                    subreddit_id,
+                    submission.id,
+                    "comment",
+                    int(comment.created_utc),
+                    comment_sentiment,
+                )
         else:
-            # If no title tickers, we must scan comments for potential tickers
-            tickers_in_comment = extract_potential_tickers(comment.body)
+            # If no title tickers, we must scan the comment for direct mentions.
+            # The type of ticker we look for depends on the mode.
+            if is_golden_mode:
+                # This case is rare (no golden in title, but some in comments) but important.
+                tickers_in_comment = extract_golden_tickers(comment.body)
+            else:
+                tickers_in_comment = extract_potential_tickers(comment.body)
             if tickers_in_comment:
                 all_tickers_found_in_post.update(tickers_in_comment)
                 for ticker_symbol in tickers_in_comment:
-                    ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
-                    database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
+                    ticker_id = database.get_or_create_entity(
+                        conn, "tickers", "symbol", ticker_symbol
+                    )
+                    database.add_mention(
+                        conn,
+                        ticker_id,
+                        subreddit_id,
+                        submission.id,
+                        "comment",
+                        int(comment.created_utc),
+                        comment_sentiment,
+                    )

-    # 4. --- Save Deep Dive and Return Tickers for Financial Update ---
-    # (This part is unchanged)
+    # 4. --- Save Deep Dive Analysis ---
     all_comment_sentiments = [get_sentiment_score(c.body) for c in all_comments]
-    avg_sentiment = sum(all_comment_sentiments) / len(all_comment_sentiments) if all_comment_sentiments else 0
+    avg_sentiment = (
+        sum(all_comment_sentiments) / len(all_comment_sentiments)
+        if all_comment_sentiments
+        else 0
+    )
     post_analysis_data = {
-        "post_id": submission.id, "title": submission.title,
-        "post_url": f"https://reddit.com{submission.permalink}", "subreddit_id": subreddit_id,
-        "post_timestamp": int(submission.created_utc), "comment_count": len(all_comments),
-        "avg_comment_sentiment": avg_sentiment
+        "post_id": submission.id,
+        "title": submission.title,
+        "post_url": f"https://reddit.com{submission.permalink}",
+        "subreddit_id": subreddit_id,
+        "post_timestamp": int(submission.created_utc),
+        "comment_count": len(all_comments),
+        "avg_comment_sentiment": avg_sentiment,
     }
     database.add_or_update_post_analysis(conn, post_analysis_data)
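
To make the behavioral change concrete, a small invented walkthrough (FakeSubmission and the sample text are hypothetical; extractor behavior as sketched at the top of this page):

class FakeSubmission:
    def __init__(self, title, selftext):
        self.title, self.selftext = title, selftext

post = FakeSubmission("GME to the moon", "I'm all in on $AMC")
golden = extract_golden_tickers(post.title + " " + post.selftext)  # {'AMC'}

# Golden mode is triggered by the '$AMC' in the selftext. The title has no '$'
# ticker, so comments are now scanned with extract_golden_tickers only: a bare
# 'GME' in a comment is ignored. Under the old logic, comments always fell back
# to extract_potential_tickers, so that same 'GME' would have been recorded.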