Refactored scraping logic.

This commit is contained in:
2025-07-31 22:23:52 +02:00
parent 2f43380970
commit 8ebaaf8b36
3 changed files with 138 additions and 191 deletions

View File

@@ -65,75 +65,117 @@ def fetch_financial_data(ticker_symbol):
def _process_submission(submission, subreddit_id, conn, comment_limit):
    """
    Process a single Reddit submission using the "Golden Ticker" logic.

    The mode is decided once from the title and selftext:
    - If a '$' ticker exists in either, the submission is in "Golden Only"
      mode and only '$'-prefixed tickers are extracted from then on.
    - Otherwise it falls back to potential (un-prefixed) tickers.

    If the title yields tickers, the title and every processed comment are
    recorded as mentions of each title ticker. If it yields none, each comment
    is scanned on its own with the extractor chosen by the mode.

    Args:
        submission: Object exposing ``title``, ``selftext``, ``id``,
            ``created_utc``, ``permalink`` and ``comments`` (presumably a
            PRAW ``Submission`` -- confirm against the caller).
        subreddit_id: Identifier passed through to the ``database`` helpers.
        conn: Handle passed through to the ``database`` helpers.
        comment_limit: Upper bound on the number of comments processed
            (used as a slice bound on the flattened comment list).

    Returns:
        set: Every ticker symbol for which a mention was recorded.
    """
    # 1. --- Establish Mode: Golden or Potential ---
    # Scan the submission's own text (title + selftext) to determine the mode.
    post_text_for_discovery = submission.title + " " + submission.selftext
    golden_tickers_in_post = extract_golden_tickers(post_text_for_discovery)
    is_golden_mode = bool(golden_tickers_in_post)

    if is_golden_mode:
        log.info(
            f" -> Golden Ticker(s) Found: {', '.join(golden_tickers_in_post)}. Engaging Golden-Only Mode."
        )
        # In Golden Mode, we ONLY care about tickers with a '$'.
        extract_tickers = extract_golden_tickers
    else:
        log.info(" -> No Golden Tickers. Falling back to potential ticker search.")
        # In Potential Mode, the potential-ticker extractor is used throughout.
        extract_tickers = extract_potential_tickers

    tickers_in_title = extract_tickers(submission.title)

    # NOTE(review): '$' tickers that appear only in the selftext switch the
    # submission into Golden Mode but are not recorded or returned unless a
    # comment repeats them. The pre-refactor code added them to the returned
    # set -- confirm that dropping them is intentional.
    all_tickers_found_in_post = set(tickers_in_title)
    ticker_id_cache = {}

    def ticker_id_for(ticker_symbol):
        # Resolve each symbol through the database at most once per submission.
        if ticker_symbol not in ticker_id_cache:
            ticker_id_cache[ticker_symbol] = database.get_or_create_entity(
                conn, "tickers", "symbol", ticker_symbol
            )
        return ticker_id_cache[ticker_symbol]

    # 2. --- Process Title Mentions ---
    if tickers_in_title:
        log.info(
            f" -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments."
        )
        post_sentiment = get_sentiment_score(submission.title)
        for ticker_symbol in tickers_in_title:
            # All title tickers are saved as 'post' type mentions.
            database.add_mention(
                conn,
                ticker_id_for(ticker_symbol),
                subreddit_id,
                submission.id,
                "post",
                int(submission.created_utc),
                post_sentiment,
            )

    # 3. --- Process Comments (single loop) ---
    submission.comments.replace_more(limit=0)
    all_comments = submission.comments.list()[:comment_limit]

    # Collected here so the average below does not re-score every comment.
    all_comment_sentiments = []

    for comment in all_comments:
        comment_sentiment = get_sentiment_score(comment.body)
        all_comment_sentiments.append(comment_sentiment)

        if tickers_in_title:
            # If the title had tickers, every comment is a mention for them;
            # the comment text itself is not scanned.
            tickers_for_comment = tickers_in_title
        else:
            # No title tickers: scan the comment for direct mentions. In
            # Golden Mode this branch means the '$' ticker was found in the
            # selftext rather than the title.
            tickers_for_comment = extract_tickers(comment.body)
            all_tickers_found_in_post.update(tickers_for_comment)

        for ticker_symbol in tickers_for_comment:
            database.add_mention(
                conn,
                ticker_id_for(ticker_symbol),
                subreddit_id,
                submission.id,
                "comment",
                int(comment.created_utc),
                comment_sentiment,
            )

    # 4. --- Save Deep Dive Analysis ---
    avg_sentiment = (
        sum(all_comment_sentiments) / len(all_comment_sentiments)
        if all_comment_sentiments
        else 0
    )
    post_analysis_data = {
        "post_id": submission.id,
        "title": submission.title,
        "post_url": f"https://reddit.com{submission.permalink}",
        "subreddit_id": subreddit_id,
        "post_timestamp": int(submission.created_utc),
        "comment_count": len(all_comments),
        "avg_comment_sentiment": avg_sentiment,
    }
    database.add_or_update_post_analysis(conn, post_analysis_data)

    return all_tickers_found_in_post