Sentiment analysis.
This commit is contained in:
72
database.py
72
database.py
@@ -12,11 +12,13 @@ def get_db_connection():
|
||||
return conn
|
||||
|
||||
def initialize_db():
|
||||
"""Initializes the database and creates tables if they don't exist."""
|
||||
"""
|
||||
Initializes the database and creates the necessary tables if they don't exist.
|
||||
"""
|
||||
conn = get_db_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# --- Create tickers table ---
|
||||
# --- Create tickers table (This is the corrected section) ---
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS tickers (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -26,7 +28,7 @@ def initialize_db():
|
||||
)
|
||||
""")
|
||||
|
||||
# --- Create subreddits table ---
|
||||
# --- Create subreddits table (This is the corrected section) ---
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS subreddits (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -34,7 +36,7 @@ def initialize_db():
|
||||
)
|
||||
""")
|
||||
|
||||
# --- Create mentions table ---
|
||||
# --- Create mentions table with sentiment_score column ---
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS mentions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -53,12 +55,23 @@ def initialize_db():
|
||||
conn.close()
|
||||
print("Database initialized successfully.")
|
||||
|
||||
def add_mention(conn, ticker_id, subreddit_id, post_id, timestamp, sentiment):
|
||||
"""Adds a new mention with its sentiment score to the database."""
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
cursor.execute(
|
||||
"INSERT INTO mentions (ticker_id, subreddit_id, post_id, mention_timestamp, sentiment_score) VALUES (?, ?, ?, ?, ?)",
|
||||
(ticker_id, subreddit_id, post_id, timestamp, sentiment)
|
||||
)
|
||||
conn.commit()
|
||||
except sqlite3.IntegrityError:
|
||||
pass # Ignore duplicate mentions
|
||||
|
||||
def get_or_create_entity(conn, table_name, column_name, value):
|
||||
"""Generic function to get or create an entity and return its ID."""
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(f"SELECT id FROM {table_name} WHERE {column_name} = ?", (value,))
|
||||
result = cursor.fetchone()
|
||||
|
||||
if result:
|
||||
return result['id']
|
||||
else:
|
||||
@@ -66,18 +79,6 @@ def get_or_create_entity(conn, table_name, column_name, value):
|
||||
conn.commit()
|
||||
return cursor.lastrowid
|
||||
|
||||
def add_mention(conn, ticker_id, subreddit_id, post_id, timestamp):
|
||||
"""Adds a new mention to the database, ignoring duplicates."""
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
cursor.execute(
|
||||
"INSERT INTO mentions (ticker_id, subreddit_id, post_id, mention_timestamp) VALUES (?, ?, ?, ?)",
|
||||
(ticker_id, subreddit_id, post_id, timestamp)
|
||||
)
|
||||
conn.commit()
|
||||
except sqlite3.IntegrityError:
|
||||
pass
|
||||
|
||||
def update_ticker_market_cap(conn, ticker_id, market_cap):
|
||||
"""Updates the market cap and timestamp for a specific ticker."""
|
||||
cursor = conn.cursor()
|
||||
@@ -95,7 +96,7 @@ def get_ticker_info(conn, ticker_id):
|
||||
return cursor.fetchone()
|
||||
|
||||
def generate_summary_report():
|
||||
"""Queries the DB to generate and print a summary with market caps."""
|
||||
"""Queries the DB to generate a summary with market caps and avg. sentiment."""
|
||||
print("\n--- Summary Report ---")
|
||||
conn = get_db_connection()
|
||||
cursor = conn.cursor()
|
||||
@@ -104,33 +105,38 @@ def generate_summary_report():
|
||||
SELECT
|
||||
t.symbol,
|
||||
t.market_cap,
|
||||
COUNT(m.id) as mention_count
|
||||
COUNT(m.id) as mention_count,
|
||||
AVG(m.sentiment_score) as avg_sentiment
|
||||
FROM mentions m
|
||||
JOIN tickers t ON m.ticker_id = t.id
|
||||
GROUP BY t.symbol, t.market_cap
|
||||
ORDER BY mention_count DESC
|
||||
LIMIT 20;
|
||||
"""
|
||||
|
||||
results = cursor.execute(query).fetchall()
|
||||
|
||||
print(f"{'Ticker':<10} | {'Mentions':<10} | {'Market Cap':<20}")
|
||||
print("-" * 45)
|
||||
print(f"{'Ticker':<10} | {'Mentions':<10} | {'Sentiment':<18} | {'Market Cap':<20}")
|
||||
print("-" * 65)
|
||||
|
||||
for row in results:
|
||||
# Format Market Cap
|
||||
market_cap_str = "N/A"
|
||||
if row['market_cap']:
|
||||
# Format market cap into a readable string (e.g., $1.23T, $45.6B, $123.4M)
|
||||
if row['market_cap'] and row['market_cap'] > 0:
|
||||
mc = row['market_cap']
|
||||
if mc >= 1e12:
|
||||
market_cap_str = f"${mc/1e12:.2f}T"
|
||||
elif mc >= 1e9:
|
||||
market_cap_str = f"${mc/1e9:.2f}B"
|
||||
elif mc >= 1e6:
|
||||
market_cap_str = f"${mc/1e6:.2f}M"
|
||||
else:
|
||||
market_cap_str = f"${mc:,}"
|
||||
if mc >= 1e12: market_cap_str = f"${mc/1e12:.2f}T"
|
||||
elif mc >= 1e9: market_cap_str = f"${mc/1e9:.2f}B"
|
||||
elif mc >= 1e6: market_cap_str = f"${mc/1e6:.2f}M"
|
||||
else: market_cap_str = f"${mc:,}"
|
||||
|
||||
print(f"{row['symbol']:<10} | {row['mention_count']:<10} | {market_cap_str:<20}")
|
||||
# Determine Sentiment Label
|
||||
sentiment_score = row['avg_sentiment']
|
||||
if sentiment_score is not None:
|
||||
if sentiment_score > 0.1: sentiment_label = f"Bullish ({sentiment_score:+.2f})"
|
||||
elif sentiment_score < -0.1: sentiment_label = f"Bearish ({sentiment_score:+.2f})"
|
||||
else: sentiment_label = f"Neutral ({sentiment_score:+.2f})"
|
||||
else:
|
||||
sentiment_label = "N/A"
|
||||
|
||||
print(f"{row['symbol']:<10} | {row['mention_count']:<10} | {sentiment_label:<18} | {market_cap_str:<20}")
|
||||
|
||||
conn.close()
|
68
main.py
68
main.py
@@ -11,49 +11,40 @@ from dotenv import load_dotenv
|
||||
|
||||
import database
|
||||
from ticker_extractor import extract_tickers
|
||||
from sentiment_analyzer import get_sentiment_score # <-- IMPORT OUR NEW MODULE
|
||||
|
||||
# Load environment variables from .env file
|
||||
load_dotenv()
|
||||
# How old (in seconds) market cap data can be before we refresh it. 24 hours = 86400 seconds.
|
||||
MARKET_CAP_REFRESH_INTERVAL = 86400
|
||||
|
||||
# ... (load_subreddits, get_market_cap, get_reddit_instance functions are unchanged)
|
||||
def load_subreddits(filepath):
|
||||
# (This function is unchanged)
|
||||
# ...
|
||||
try:
|
||||
with open(filepath, 'r') as f:
|
||||
data = json.load(f)
|
||||
return data.get("subreddits", [])
|
||||
except FileNotFoundError:
|
||||
print(f"Error: The file '{filepath}' was not found.")
|
||||
return None
|
||||
except json.JSONDecodeError:
|
||||
print(f"Error: Could not decode JSON from '{filepath}'.")
|
||||
with open(filepath, 'r') as f: return json.load(f).get("subreddits", [])
|
||||
except (FileNotFoundError, json.JSONDecodeError) as e:
|
||||
print(f"Error loading config: {e}")
|
||||
return None
|
||||
|
||||
def get_market_cap(ticker_symbol):
|
||||
"""Fetches the market capitalization for a given stock ticker from yfinance."""
|
||||
# ...
|
||||
try:
|
||||
ticker = yf.Ticker(ticker_symbol)
|
||||
# .info can be slow; .fast_info is a lighter alternative
|
||||
market_cap = ticker.fast_info.get('marketCap')
|
||||
return market_cap if market_cap else None
|
||||
except Exception:
|
||||
return None
|
||||
return ticker.fast_info.get('marketCap')
|
||||
except Exception: return None
|
||||
|
||||
def get_reddit_instance():
|
||||
# (This function is unchanged)
|
||||
# ...
|
||||
client_id = os.getenv("REDDIT_CLIENT_ID")
|
||||
client_secret = os.getenv("REDDIT_CLIENT_SECRET")
|
||||
user_agent = os.getenv("REDDIT_USER_AGENT")
|
||||
|
||||
if not all([client_id, client_secret, user_agent]):
|
||||
print("Error: Reddit API credentials not found in .env file.")
|
||||
print("Error: Reddit API credentials not found.")
|
||||
return None
|
||||
return praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)
|
||||
|
||||
|
||||
def scan_subreddits(reddit, subreddits_list, post_limit=25):
|
||||
"""Scans subreddits, stores mentions, and updates market caps in the database."""
|
||||
"""Scans subreddits, performs sentiment analysis, and stores results in the database."""
|
||||
conn = database.get_db_connection()
|
||||
|
||||
print(f"\nScanning {len(subreddits_list)} subreddits for top {post_limit} posts...")
|
||||
@@ -64,32 +55,34 @@ def scan_subreddits(reddit, subreddits_list, post_limit=25):
|
||||
print(f"Scanning r/{subreddit_name}...")
|
||||
|
||||
for submission in subreddit.hot(limit=post_limit):
|
||||
full_text = submission.title + " " + submission.selftext
|
||||
tickers_in_post = extract_tickers(full_text)
|
||||
# We analyze the title for sentiment as it's often the most concise summary.
|
||||
# Analyzing all comments could be a future enhancement.
|
||||
text_to_analyze = submission.title
|
||||
tickers_in_post = extract_tickers(text_to_analyze + " " + submission.selftext)
|
||||
|
||||
# --- NEW: Get sentiment score for the post's title ---
|
||||
sentiment = get_sentiment_score(text_to_analyze)
|
||||
|
||||
for ticker_symbol in set(tickers_in_post):
|
||||
ticker_id = database.get_or_create_entity(conn, 'tickers', 'symbol', ticker_symbol)
|
||||
|
||||
# --- NEW: Pass the sentiment score to the database ---
|
||||
database.add_mention(
|
||||
conn,
|
||||
ticker_id=ticker_id,
|
||||
subreddit_id=subreddit_id,
|
||||
post_id=submission.id,
|
||||
timestamp=int(submission.created_utc)
|
||||
timestamp=int(submission.created_utc),
|
||||
sentiment=sentiment # Pass the score here
|
||||
)
|
||||
|
||||
# --- Check if market cap needs updating ---
|
||||
# (The market cap update logic remains the same)
|
||||
ticker_info = database.get_ticker_info(conn, ticker_id)
|
||||
current_time = int(time.time())
|
||||
|
||||
if not ticker_info['last_updated'] or (current_time - ticker_info['last_updated'] > MARKET_CAP_REFRESH_INTERVAL):
|
||||
print(f" -> Fetching market cap for {ticker_symbol}...")
|
||||
market_cap = get_market_cap(ticker_symbol)
|
||||
if market_cap:
|
||||
database.update_ticker_market_cap(conn, ticker_id, market_cap)
|
||||
else:
|
||||
# If fetch fails, still update the timestamp so we don't try again for 24 hours
|
||||
database.update_ticker_market_cap(conn, ticker_id, ticker_info['market_cap']) # Keep old value
|
||||
database.update_ticker_market_cap(conn, ticker_id, market_cap or ticker_info['market_cap'])
|
||||
|
||||
except Exception as e:
|
||||
print(f"Could not scan r/{subreddit_name}. Error: {e}")
|
||||
@@ -97,28 +90,23 @@ def scan_subreddits(reddit, subreddits_list, post_limit=25):
|
||||
conn.close()
|
||||
print("\n--- Scan Complete ---")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to run the Reddit stock analysis tool."""
|
||||
# --- IMPORTANT: Delete your old DB file before running! ---
|
||||
# Since we changed the schema and logic, old data won't have sentiment.
|
||||
# It's best to start fresh. Delete the `reddit_stocks.db` file now.
|
||||
|
||||
parser = argparse.ArgumentParser(description="Analyze stock ticker mentions on Reddit.")
|
||||
parser.add_argument("config_file", help="Path to the JSON file containing subreddits.")
|
||||
args = parser.parse_args()
|
||||
|
||||
# --- Part 1: Initialize ---
|
||||
database.initialize_db()
|
||||
|
||||
subreddits = load_subreddits(args.config_file)
|
||||
if not subreddits: return
|
||||
|
||||
reddit = get_reddit_instance()
|
||||
if not reddit: return
|
||||
|
||||
# --- Part 2: Scan and Store ---
|
||||
scan_subreddits(reddit, subreddits)
|
||||
|
||||
# --- Part 3: Generate and Display Report ---
|
||||
database.generate_summary_report()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
BIN
reddit_stocks.db
Normal file
BIN
reddit_stocks.db
Normal file
Binary file not shown.
@@ -1,3 +1,4 @@
|
||||
yfinance
|
||||
praw
|
||||
python-dotenv
|
||||
nltk
|
19
sentiment_analyzer.py
Normal file
19
sentiment_analyzer.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# sentiment_analyzer.py
|
||||
|
||||
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
||||
|
||||
# Initialize the VADER sentiment intensity analyzer
|
||||
# We only need to create one instance of this.
|
||||
_analyzer = SentimentIntensityAnalyzer()
|
||||
|
||||
def get_sentiment_score(text):
|
||||
"""
|
||||
Analyzes a piece of text and returns its sentiment score.
|
||||
|
||||
The 'compound' score is a single metric that summarizes the sentiment.
|
||||
It ranges from -1 (most negative) to +1 (most positive).
|
||||
"""
|
||||
# The polarity_scores() method returns a dictionary with 'neg', 'neu', 'pos', and 'compound' scores.
|
||||
# We are most interested in the 'compound' score.
|
||||
scores = _analyzer.polarity_scores(text)
|
||||
return scores['compound']
|
11
setup_nltk.py
Normal file
11
setup_nltk.py
Normal file
@@ -0,0 +1,11 @@
|
||||
import nltk
|
||||
|
||||
# This will download the 'vader_lexicon' dataset
|
||||
# It only needs to be run once
|
||||
try:
|
||||
nltk.data.find('sentiment/vader_lexicon.zip')
|
||||
print("VADER lexicon is already downloaded.")
|
||||
except LookupError:
|
||||
print("Downloading VADER lexicon...")
|
||||
nltk.download('vader_lexicon')
|
||||
print("Download complete.")
|
@@ -19,9 +19,31 @@ COMMON_WORDS_BLACKLIST = {
|
||||
"NASDAQ", "NYSE", "AMEX", "FTSE", "DAX", "WSB", "SPX", "DJIA",
|
||||
"EDGAR", "GDP", "CPI", "PPI", "PMI", "ISM", "FOMC", "ECB", "BOE",
|
||||
"BOJ", "RBA", "RBNZ", "BIS", "NFA", "P", "VOO", "CTB", "DR",
|
||||
"ETF", "EV", "ESG", "REIT", "SPAC", "IPO", "M&A", "LBO",
|
||||
"ETF", "EV", "ESG", "REIT", "SPAC", "IPO", "M&A", "LBO", "PE",
|
||||
"Q1", "Q2", "Q3", "Q4", "FY", "FAQ", "ROI", "ROE", "EPS", "P/E", "PEG",
|
||||
"FRG", "FXAIX", "FXIAX", "FZROX"
|
||||
"FRG", "FXAIX", "FXIAX", "FZROX", "BULL", "BEAR", "BULLISH", "BEARISH",
|
||||
"QQQ", "SPY", "DIA", "IWM", "VTI", "VOO", "IVV", "SCHB", "SPLG",
|
||||
"ROTH", "IRA", "401K", "403B", "457B", "SEP", "SIMPLE", "HSA",
|
||||
"LONG", "SHORT", "LEVERAGE", "MARGIN", "HEDGE", "SWING", "DAY",
|
||||
"GRAB", "GPU", "MY", "PSA", "AMA", "DM", "OP", "SPAC", "FIHTX",
|
||||
"FINTX", "FINT", "FINTX", "FINTY", "FSPSX", "TOTAL", "LARGE", "MID", "SMALL",
|
||||
"GROWTH", "VALUE", "BLEND", "INCOME", "DIV", "YIELD", "BETA", "ALPHA", "VOLATILITY",
|
||||
"RISK", "RETURN", "SHARPE", "SORTINO", "MAX", "MIN", "STDDEV", "VARIANCE",
|
||||
"PDF", "FULL", "PEAK", "LATE", "EARLY", "MIDDAY", "NIGHT", "MORNING", "AFTERNOON",
|
||||
"CYCLE", "TREND", "PATTERN", "BREAKOUT", "PULLBACK", "REVERSAL", "CONSOLIDATION",
|
||||
"OTC", "TRUE", "FALSE", "NULL", "NONE", "ALL", "ANY", "SOME", "EACH", "EVERY",
|
||||
"STILL", "TERM", "TIME", "DATE", "YEAR", "MONTH", "WEEK", "HOUR", "MINUTE", "SECOND",
|
||||
"JUST", "ALREADY", "STILL", "YET", "NOW", "LATER", "SOON", "EARLIER", "TODAY", "TOMORROW",
|
||||
"YESTERDAY", "TONIGHT", "THIS", "LAST", "NEXT", "WOULD", "SHOULD", "COULD", "MIGHT",
|
||||
"WILL", "CAN", "MUST", "SHALL", "OUGHT", "TAKE", "MAKE", "HAVE", "GET", "DO", "BE",
|
||||
"GO", "COME", "SEE", "LOOK", "WATCH", "HEAR", "YES", "NO", "OK", "LIKE", "LOVE", "HATE",
|
||||
"WANT", "NEED", "THINK", "BELIEVE", "KNOW", "PRICE", "COST", "VALUE", "WORTH",
|
||||
"EXPENSE", "SPEND", "SAVE", "EARN", "PROFIT", "LOSS", "GAIN", "DEBT", "CREDIT",
|
||||
"BOND", "STOCK", "SHARE", "FUND", "ASSET", "LIABILITY", "BUZZ", "UNDER", "OVER", "BETWEEN",
|
||||
"FRAUD", "SCAM", "RISK", "REWARD", "RETURN", "INVEST", "TRADE", "BUY", "SELL", "HOLD",
|
||||
"SHORT", "LONG", "LEVERAGE", "MARGIN", "HEDGE", "SCALP", "POSITION",
|
||||
"PLAN", "GOAL", "WILL", "FAST", "HINT", "ABOVE", "BELOW", "AROUND", "NEAR", "FAR",
|
||||
"TL",
|
||||
}
|
||||
|
||||
def extract_tickers(text):
|
||||
|
Reference in New Issue
Block a user