Major improvement for discovering tickers. A GOLDEN TICKER IDEA!

This commit is contained in:
2025-07-29 21:51:30 +02:00
parent 8a80df5946
commit 9e5455592b
2 changed files with 65 additions and 78 deletions

View File

@@ -119,23 +119,26 @@ COMMON_WORDS_BLACKLIST = {
"ZEN", "ZERO", "ZEV"
}
def extract_golden_tickers(text):
    """
    Extracts ONLY tickers with a '$' prefix. This is the highest-confidence signal.

    A match is a '$' followed by 1-5 uppercase ASCII letters that end on a
    word boundary, so '$TSLA' matches while '$tsla' and '$TOOLONG' do not.

    Args:
        text: The string to scan.

    Returns:
        A set of ticker symbols with the '$' removed (e.g., {'TSLA', 'GME'}).
        The set is empty when the text contains no '$'-prefixed ticker.
    """
    # The capture group makes findall return only the letters, so no
    # separate step is needed to strip the '$' afterwards.
    ticker_regex = r"\$([A-Z]{1,5})\b"
    return set(re.findall(ticker_regex, text))
def extract_potential_tickers(text):
    """
    Extracts potential tickers (all-caps words). This is a lower-confidence signal
    intended as a fallback for text that has no '$'-prefixed tickers; this
    function itself does not check for them.

    A candidate is a standalone run of 2-5 uppercase ASCII letters. Candidates
    found in COMMON_WORDS_BLACKLIST are dropped.

    Args:
        text: The string to scan.

    Returns:
        A set of candidate ticker symbols (e.g., {'GME', 'AMC'}).
    """
    # Regex to find all-caps words between 2 and 5 characters: GME, AMC
    ticker_regex = r"\b[A-Z]{2,5}\b"
    potential_tickers = re.findall(ticker_regex, text)

    # Filter out common blacklisted words; the set also removes duplicates.
    return {
        ticker
        for ticker in potential_tickers
        if ticker not in COMMON_WORDS_BLACKLIST
    }