Improve ticker discovery ("Golden Ticker" logic): prioritize high-confidence '$'-prefixed tickers, falling back to bare all-caps candidate tickers only when no '$' tickers are present.
This commit is contained in:
@@ -14,7 +14,7 @@ import yfinance as yf
|
||||
import pandas as pd
|
||||
|
||||
from . import database
|
||||
from .ticker_extractor import extract_tickers
|
||||
from .ticker_extractor import extract_golden_tickers, extract_potential_tickers
|
||||
from .sentiment_analyzer import get_sentiment_score
|
||||
from .logger_setup import setup_logging, logger as log
|
||||
|
||||
@@ -65,91 +65,75 @@ def fetch_financial_data(ticker_symbol):
|
||||
|
||||
def _process_submission(submission, subreddit_id, conn, comment_limit):
    """
    Process a single Reddit submission using the "Golden Ticker" logic.

    - Prioritizes tickers with a '$' prefix (the highest-confidence signal).
    - Falls back to potential all-caps tickers only if no '$' tickers exist.

    Args:
        submission: a praw-style submission object (uses .title, .selftext,
            .comments, .id, .created_utc, .permalink) — assumed PRAW; confirm.
        subreddit_id: database id of the subreddit this submission belongs to.
        conn: open database connection, passed through to `database` helpers.
        comment_limit: maximum number of comments to scan for this submission.

    Returns:
        A set of every unique ticker symbol found in the post.  Financial
        data is NOT fetched here; the caller uses the returned set for that.
    """
    # 1. --- Golden Ticker Discovery ---
    # Search the entire post (title and body) for high-confidence '$' tickers.
    post_text_for_discovery = submission.title + " " + submission.selftext
    golden_tickers = extract_golden_tickers(post_text_for_discovery)

    tickers_in_title = set()
    all_tickers_found_in_post = set()

    # 2. --- Apply Contextual Logic ---
    if golden_tickers:
        # --- CASE A: Golden Tickers were found ---
        log.info(f" -> Golden Ticker(s) Found: {', '.join(golden_tickers)}. Prioritizing these.")
        all_tickers_found_in_post.update(golden_tickers)
        # Only the golden tickers that also appear in the title drive the
        # "attribute every comment to the title tickers" behavior below.
        tickers_in_title = {ticker for ticker in golden_tickers if ticker in extract_golden_tickers(submission.title)}
    else:
        # --- CASE B: No Golden Tickers, fall back to best-guess ---
        log.info(" -> No Golden Tickers. Falling back to potential ticker search.")
        # Search for potential tickers (e.g. 'GME' without a '$').
        tickers_in_title = extract_potential_tickers(submission.title)
        all_tickers_found_in_post.update(tickers_in_title)

    # 3. --- Mention Processing ---
    ticker_id_cache = {}
    submission.comments.replace_more(limit=0)
    all_comments = submission.comments.list()[:comment_limit]

    # Score each comment exactly once and reuse the score for both the
    # mention rows and the deep-dive average.  (The previous version called
    # get_sentiment_score twice per comment: once in the mention loop and
    # once more in the deep-dive list comprehension.)
    comment_sentiments = [get_sentiment_score(c.body) for c in all_comments]

    # Process title mentions.
    if tickers_in_title:
        log.info(f" -> Title Mention(s): {', '.join(tickers_in_title)}. Attributing all comments.")
        post_sentiment = get_sentiment_score(submission.title)
        for ticker_symbol in tickers_in_title:
            ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
            ticker_id_cache[ticker_symbol] = ticker_id
            database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'post', int(submission.created_utc), post_sentiment)

    # Process comments.
    for comment, comment_sentiment in zip(all_comments, comment_sentiments):
        if tickers_in_title:
            # If the title has tickers, every comment is a mention for them.
            for ticker_symbol in tickers_in_title:
                ticker_id = ticker_id_cache[ticker_symbol]  # populated above
                database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)
        else:
            # No title tickers: scan each comment for potential tickers.
            tickers_in_comment = extract_potential_tickers(comment.body)
            if tickers_in_comment:
                all_tickers_found_in_post.update(tickers_in_comment)
                for ticker_symbol in tickers_in_comment:
                    ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
                    database.add_mention(conn, ticker_id, subreddit_id, submission.id, 'comment', int(comment.created_utc), comment_sentiment)

    # 4. --- Save Deep Dive and Return Tickers for Financial Update ---
    avg_sentiment = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0
    post_analysis_data = {
        "post_id": submission.id,
        "title": submission.title,
        "post_url": f"https://reddit.com{submission.permalink}",
        "subreddit_id": subreddit_id,
        "post_timestamp": int(submission.created_utc),
        "comment_count": len(all_comments),
        "avg_comment_sentiment": avg_sentiment,
    }
    database.add_or_update_post_analysis(conn, post_analysis_data)

    return all_tickers_found_in_post
@@ -119,23 +119,26 @@ COMMON_WORDS_BLACKLIST = {
|
||||
"ZEN", "ZERO", "ZEV"
|
||||
}
|
||||
|
||||
def extract_golden_tickers(text):
    """
    Extract only '$'-prefixed tickers — the highest-confidence signal.

    Returns a set of cleaned symbols (e.g. {'TSLA', 'GME'}).
    """
    # A golden ticker is a literal '$' followed by 1-5 uppercase letters,
    # e.g. $AAPL or $TSLA.
    matches = re.findall(r"\$[A-Z]{1,5}\b", text)
    cleaned = set()
    for match in matches:
        # Strip the leading '$'; upper() is kept for symmetry with the
        # potential-ticker path even though the regex is already uppercase.
        cleaned.add(match.replace("$", "").upper())
    return cleaned
||||
def extract_potential_tickers(text):
    """
    Extract lower-confidence candidate tickers: bare all-caps words.

    Used as a fallback only when no '$'-prefixed ("golden") tickers are
    present.  Returns a set of symbols with common English words removed.
    """
    # Candidates are all-caps words of 2-5 letters, e.g. GME or AMC.
    candidates = re.findall(r"\b[A-Z]{2,5}\b", text)

    # Drop blacklisted common words; building a set also deduplicates.
    result = set()
    for word in candidates:
        if word in COMMON_WORDS_BLACKLIST:
            continue
        result.add(word)
    return result
|
||||
|
Reference in New Issue
Block a user