Major improvement for discovering tickers. A GOLDEN TICKER IDEA!

2025-07-29 21:51:30 +02:00
parent 8a80df5946
commit 9e5455592b
2 changed files with 65 additions and 78 deletions
--- a/rstat_tool/ticker_extractor.py
+++ b/rstat_tool/ticker_extractor.py
@@ -119,23 +119,26 @@ COMMON_WORDS_BLACKLIST = {
    "ZEN", "ZERO", "ZEV"
 }

-def extract_tickers(text):
+def extract_golden_tickers(text):
    """
-    Extracts potential stock tickers from a given piece of text.
-    A ticker is identified as a 1-5 character uppercase word, or a word prefixed with $.
+    Extracts ONLY '$'-prefixed tickers of 1-5 uppercase letters (e.g. $TSLA). This is the highest-confidence signal.
+    Returns a set of cleaned ticker symbols (e.g., {'TSLA', 'GME'}).
    """
-    # Regex to find potential tickers:
-    # 1. Words prefixed with $: $AAPL, $TSLA
-    # 2. All-caps words between 1 and 5 characters: GME, AMC
-    ticker_regex = r"\$[A-Z]{1,5}\b|\b[A-Z]{2,5}\b"
+    # Regex to find words prefixed with $: $AAPL, $TSLA
+    ticker_regex = r"\$[A-Z]{1,5}\b"
+    tickers = re.findall(ticker_regex, text)
+    # Clean the tickers by removing the '$' and return as a set
+    return {ticker.replace("$", "").upper() for ticker in tickers}

+def extract_potential_tickers(text):
+    """
+    Extracts potential tickers (all-caps words of 2-5 letters that are not in COMMON_WORDS_BLACKLIST).
+    This is a lower-confidence signal used as a fallback when no golden tickers are present.
+    Returns a set of cleaned ticker symbols.
+    """
+    # Regex to find all-caps words between 2 and 5 characters: GME, AMC
+    ticker_regex = r"\b[A-Z]{2,5}\b"
    potential_tickers = re.findall(ticker_regex, text)
-
-    # Filter out common words and remove the '$' prefix
-    tickers = []
-    for ticker in potential_tickers:
-        cleaned_ticker = ticker.replace("$", "").upper()
-        if cleaned_ticker not in COMMON_WORDS_BLACKLIST:
-            tickers.append(cleaned_ticker)
-
-    return tickers
+    
+    # Filter out common blacklisted words
+    return {ticker for ticker in potential_tickers if ticker not in COMMON_WORDS_BLACKLIST}