Major improvement for discovering tickers. A GOLDEN TICKER IDEA!
This commit is contained in:
@@ -119,23 +119,26 @@ COMMON_WORDS_BLACKLIST = {
|
||||
"ZEN", "ZERO", "ZEV"
|
||||
}
|
||||
|
||||
def extract_tickers(text):
|
||||
def extract_golden_tickers(text):
|
||||
"""
|
||||
Extracts potential stock tickers from a given piece of text.
|
||||
A ticker is identified as a 2-5 character uppercase word, or a 1-5 character uppercase word prefixed with $.
|
||||
Extracts ONLY tickers with a '$' prefix. This is the highest-confidence signal.
|
||||
Returns a set of cleaned ticker symbols (e.g., {'TSLA', 'GME'}).
|
||||
"""
|
||||
# Regex to find potential tickers:
|
||||
# 1. Words prefixed with $: $AAPL, $TSLA
|
||||
# 2. All-caps words between 2 and 5 characters: GME, AMC
|
||||
ticker_regex = r"\$[A-Z]{1,5}\b|\b[A-Z]{2,5}\b"
|
||||
# Regex to find words prefixed with $: $AAPL, $TSLA
|
||||
ticker_regex = r"\$[A-Z]{1,5}\b"
|
||||
tickers = re.findall(ticker_regex, text)
|
||||
# Clean the tickers by removing the '$' and return as a set
|
||||
return {ticker.replace("$", "").upper() for ticker in tickers}
|
||||
|
||||
def extract_potential_tickers(text):
|
||||
"""
|
||||
Extracts potential tickers (all-caps words). This is a lower-confidence signal
|
||||
used as a fallback when no golden tickers are present.
|
||||
Returns a set of cleaned ticker symbols.
|
||||
"""
|
||||
# Regex to find all-caps words between 2 and 5 characters: GME, AMC
|
||||
ticker_regex = r"\b[A-Z]{2,5}\b"
|
||||
potential_tickers = re.findall(ticker_regex, text)
|
||||
|
||||
# Filter out common words and remove the '$' prefix
|
||||
tickers = []
|
||||
for ticker in potential_tickers:
|
||||
cleaned_ticker = ticker.replace("$", "").upper()
|
||||
if cleaned_ticker not in COMMON_WORDS_BLACKLIST:
|
||||
tickers.append(cleaned_ticker)
|
||||
|
||||
return tickers
|
||||
|
||||
# Filter out common blacklisted words
|
||||
return {ticker for ticker in potential_tickers if ticker not in COMMON_WORDS_BLACKLIST}
|
||||
|
Reference in New Issue
Block a user