Scraping Yahoo Finance Stock Data in 2026: yfinance, Direct API, and Fallbacks
Scraping Yahoo Finance Stock Data in 2026
Yahoo Finance remains the best free source for stock market data. Bloomberg Terminal costs $24,000/year. Alpha Vantage caps you at 25 requests/day on the free tier. Yahoo Finance gives you real-time quotes, 20+ years of historical data, full financial statements, and earnings data — all without an API key.
The catch: Yahoo doesn't officially support any of this. They shut down their public API in 2017, and the unofficial endpoints that replaced it break every few months when Yahoo updates their authentication flow. This guide covers what works right now, how to handle the inevitable breakage, and when to use alternatives.
Installation
pip install yfinance httpx pandas
(sqlite3 ships with Python's standard library, so it does not need to be installed with pip.)
For proxy support and production usage:
pip install "httpx[socks]" curl-cffi
Method 1: yfinance (Start Here)
The yfinance library wraps Yahoo Finance's internal API. It handles session management, cookie consent, and crumb tokens automatically. Start with yfinance — it's the path of least resistance:
import yfinance as yf
import pandas as pd
import json
from datetime import datetime
def get_stock_overview(ticker: str) -> dict:
    """Return a flat snapshot of headline metrics for one ticker.

    Every value is looked up in ``yf.Ticker(ticker).info``. A key that is
    missing from that mapping yields ``None`` in the result.
    """
    info = yf.Ticker(ticker).info

    # (output key, key inside the ``info`` mapping), listed in output order.
    field_map = (
        ("previous_close", "previousClose"),
        ("open", "open"),
        ("day_high", "dayHigh"),
        ("day_low", "dayLow"),
        ("market_cap", "marketCap"),
        ("enterprise_value", "enterpriseValue"),
        ("pe_ratio", "trailingPE"),
        ("forward_pe", "forwardPE"),
        ("peg_ratio", "pegRatio"),
        ("price_to_sales", "priceToSalesTrailing12Months"),
        ("price_to_book", "priceToBook"),
        ("ev_to_ebitda", "enterpriseToEbitda"),
        ("dividend_yield", "dividendYield"),
        ("dividend_rate", "dividendRate"),
        ("payout_ratio", "payoutRatio"),
        ("beta", "beta"),
        ("52w_high", "fiftyTwoWeekHigh"),
        ("52w_low", "fiftyTwoWeekLow"),
        ("50d_ma", "fiftyDayAverage"),
        ("200d_ma", "twoHundredDayAverage"),
        ("avg_volume", "averageVolume"),
        ("avg_volume_10d", "averageVolume10days"),
        ("float_shares", "floatShares"),
        ("shares_outstanding", "sharesOutstanding"),
        ("short_ratio", "shortRatio"),
        ("short_percent", "shortPercentOfFloat"),
        ("sector", "sector"),
        ("industry", "industry"),
        ("country", "country"),
        ("exchange", "exchange"),
        ("website", "website"),
        ("description", "longBusinessSummary"),
    )

    # Prefer "currentPrice"; fall back to "regularMarketPrice" when it is falsy.
    price = info.get("currentPrice") or info.get("regularMarketPrice")

    overview = {
        "ticker": ticker,
        "name": info.get("longName"),
        "price": price,
    }
    for out_key, info_key in field_map:
        overview[out_key] = info.get(info_key)
    return overview
# Example: fetch Apple's overview and print only the populated fields.
overview = get_stock_overview("AAPL")
populated = {key: value for key, value in overview.items() if value is not None}
print(json.dumps(populated, indent=2))
Historical Price Data
yfinance's history() method is the cleanest interface for OHLCV data:
def get_historical_prices(
    ticker: str,
    period: str = "1y",
    interval: str = "1d",
    start: str = None,
    end: str = None,
    auto_adjust: bool = True,
) -> pd.DataFrame:
    """
    Download OHLCV bars for ``ticker`` through ``yf.Ticker(...).history``.

    When ``start`` is truthy the request uses the explicit ``start``/``end``
    window and ``period`` is ignored; otherwise ``period`` is used.

    period options: 1d, 5d, 1mo, 3mo, 6mo, 1y, 2y, 5y, 10y, ytd, max
    interval options: 1m, 2m, 5m, 15m, 30m, 60m, 90m, 1h, 1d, 5d, 1wk, 1mo, 3mo
    Note: 1m data is only available for the last 7 days
          1h data is only available for the last 730 days
    auto_adjust=True adjusts for splits and dividends (recommended)

    Returns the Open/High/Low/Close/Volume columns rounded to 4 decimals with
    a timezone-naive index. An empty frame is returned untouched after a
    warning is printed.
    """
    request = {"interval": interval, "auto_adjust": auto_adjust}
    if start:
        request.update(start=start, end=end)
    else:
        request["period"] = period

    frame = yf.Ticker(ticker).history(**request)
    if frame.empty:
        print(f"Warning: No data returned for {ticker}")
        return frame

    # Strip timezone info so downstream code can treat the index as naive.
    if getattr(frame.index, "tz", None) is not None:
        frame.index = frame.index.tz_localize(None)

    # Round away floating point noise, then keep only the OHLCV columns
    # (this drops Dividends / Stock Splits).
    return frame.round(4)[["Open", "High", "Low", "Close", "Volume"]]
# 1 year of daily AAPL data
aapl = get_historical_prices("AAPL", period="1y")
print(f"AAPL — {len(aapl)} trading days")
print(aapl.tail(5).to_string())

# 5-minute intraday data for MSFT (last 5 days only)
msft_intraday = get_historical_prices("MSFT", period="5d", interval="5m")
print(f"\nMSFT intraday — {len(msft_intraday)} 5-minute bars")

# Historical with specific dates.
# yfinance treats `end` as exclusive, so the window has to end on 2026-01-01
# for the 2025-12-31 session to be included in the "2025" data.
nvda_2025 = get_historical_prices(
    "NVDA",
    start="2025-01-01",
    end="2026-01-01",
)
print(f"\nNVDA 2025 — {len(nvda_2025)} trading days")
# get_historical_prices returns an empty frame when Yahoo sends nothing back;
# indexing iloc[0] on it would raise IndexError, so guard the calculation.
if not nvda_2025.empty:
    first_close = nvda_2025["Close"].iloc[0]
    last_close = nvda_2025["Close"].iloc[-1]
    print(f"2025 return: {((last_close / first_close) - 1) * 100:.1f}%")
Financial Statements
Yahoo Finance provides quarterly and annual income statements, balance sheets, and cash flow statements:
def get_financials(ticker: str) -> dict:
    """Collect the six statement tables yfinance exposes for ``ticker``.

    The result maps ``income_*``, ``balance_*`` and ``cashflow_*`` keys (each
    in a ``_quarterly`` and an ``_annual`` flavour) to the matching attribute
    of ``yf.Ticker(ticker)``.
    """
    stock = yf.Ticker(ticker)

    statements = {}
    statements["income_quarterly"] = stock.quarterly_income_stmt
    statements["income_annual"] = stock.income_stmt
    statements["balance_quarterly"] = stock.quarterly_balance_sheet
    statements["balance_annual"] = stock.balance_sheet
    statements["cashflow_quarterly"] = stock.quarterly_cashflow
    statements["cashflow_annual"] = stock.cashflow
    return statements
def print_revenue_trend(ticker: str):
    """Print the quarterly "Total Revenue" series for ``ticker``, oldest first.

    Each line is labelled ``YYYY-Qn`` where ``n`` is the calendar quarter of
    the period-end date found in the statement's column index (this is not
    necessarily the company's fiscal quarter). When the same calendar quarter
    of the previous year is also present, a year-over-year change is appended.

    Prints a diagnostic instead when the statement is missing or has no
    "Total Revenue" row. Returns ``None``.
    """
    stock = yf.Ticker(ticker)
    income = stock.quarterly_income_stmt
    if income is None or income.empty:
        print(f"No income statement data for {ticker}")
        return

    if "Total Revenue" not in income.index:
        available = list(income.index[:10])
        print(f"'Total Revenue' not found. Available rows: {available}")
        return

    revenue = income.loc["Total Revenue"].dropna().sort_index()

    # Key by (year, calendar quarter) so the prior-year quarter can be looked up.
    by_quarter = {
        (date.year, (date.month - 1) // 3 + 1): val
        for date, val in revenue.items()
    }

    print(f"\n{ticker} Quarterly Revenue:")
    for (year, quarter), val in by_quarter.items():
        prior = by_quarter.get((year - 1, quarter))
        # Skip the label when there is no (or a zero) prior-year figure.
        yoy_label = f" ({(val / prior - 1) * 100:+.1f}% YoY)" if prior else ""
        print(f" {year}-Q{quarter}: ${val/1e9:.2f}B{yoy_label}")
def _latest_statement_value(statement, row: str):
    """Return ``row``'s value in the first column of ``statement``, or None.

    ``None`` is returned when the statement is missing/empty, the row label is
    absent, or the cell is NaN.
    """
    if statement is None or statement.empty or row not in statement.index:
        return None
    value = statement.loc[row, statement.columns[0]]
    return None if pd.isna(value) else value


def get_key_metrics(ticker: str) -> dict:
    """Extract key financial metrics from statements.

    Reads the first column of ``income_stmt``, ``balance_sheet`` and
    ``cashflow`` on ``yf.Ticker(ticker)`` and returns only the metrics that
    have a usable value: missing rows and NaN cells are both left out, so
    callers can format every returned value as a number.

    The ``*_ttm`` key names are kept for compatibility; the values come from
    ``stock.income_stmt`` (the statement this file labels as annual), not from
    a computed trailing-twelve-month sum.
    """
    stock = yf.Ticker(ticker)

    # (statement, ((output key, row label), ...)) in output order.
    sources = (
        (
            stock.income_stmt,
            (
                ("revenue_ttm", "Total Revenue"),
                ("gross_profit_ttm", "Gross Profit"),
                ("operating_income_ttm", "Operating Income"),
                ("net_income_ttm", "Net Income"),
                ("ebitda_ttm", "EBITDA"),
            ),
        ),
        (
            stock.balance_sheet,
            (
                ("total_assets", "Total Assets"),
                ("total_debt", "Total Debt"),
                ("cash", "Cash And Cash Equivalents"),
                ("stockholder_equity", "Stockholders Equity"),
            ),
        ),
        (
            stock.cashflow,
            (
                ("operating_cf", "Operating Cash Flow"),
                ("free_cf", "Free Cash Flow"),
                ("capex", "Capital Expenditure"),
            ),
        ),
    )

    metrics = {}
    for statement, rows in sources:
        for key, row in rows:
            value = _latest_statement_value(statement, row)
            if value is not None:
                metrics[key] = value
    return metrics
# Demo: revenue trend for Alphabet, then headline metrics for Microsoft.
print_revenue_trend("GOOGL")

metrics = get_key_metrics("MSFT")
print(f"\nMSFT Key Metrics:")
for name, amount in metrics.items():
    # Large magnitudes are shown in billions, everything else in full.
    if abs(amount) > 1e8:
        line = f" {name}: ${amount/1e9:.2f}B"
    else:
        line = f" {name}: ${amount:,.0f}"
    print(line)
Earnings Data
Earnings history and upcoming earnings dates:
def get_earnings_data(ticker: str) -> dict:
    """Gather the next earnings date and past EPS results for ``ticker``.

    Both lookups are best-effort: any exception raised while reading
    ``stock.calendar`` or ``stock.earnings_history`` is swallowed and whatever
    was collected up to that point is returned. ``earnings_estimate`` is
    always ``None``.
    """
    stock = yf.Ticker(ticker)
    next_date = None
    past_results = []

    # Earnings calendar: handled both as a dict and as a DataFrame.
    try:
        calendar = stock.calendar
        if isinstance(calendar, dict):
            next_date = calendar.get("Earnings Date", [None])[0]
        elif isinstance(calendar, pd.DataFrame):
            next_date = None if calendar.empty else calendar.iloc[0, 0]
    except Exception:
        pass

    # Earnings history: one record per row, labelled by the row's index value.
    try:
        earnings = stock.earnings_history
        if earnings is not None and not earnings.empty:
            for _, row in earnings.iterrows():
                record = {
                    "date": str(row.name),
                    "eps_estimate": row.get("epsEstimate"),
                    "eps_actual": row.get("epsActual"),
                    "surprise": row.get("surprisePercent"),
                }
                past_results.append(record)
    except Exception:
        pass

    return {
        "earnings_history": past_results,
        "earnings_estimate": None,
        "next_earnings_date": next_date,
    }
def get_institutional_holdings(ticker: str) -> list[dict]:
    """List the rows of ``institutional_holders`` for ``ticker`` as dicts.

    Returns an empty list when yfinance provides no table (``None`` or empty).
    Each dict carries ``holder``, ``shares``, ``date_reported`` (stringified),
    ``pct_held`` and ``value``.
    """
    holders = yf.Ticker(ticker).institutional_holders
    if holders is None or holders.empty:
        return []

    records = []
    for _, row in holders.iterrows():
        records.append(
            {
                "holder": row.get("Holder"),
                "shares": row.get("Shares"),
                "date_reported": str(row.get("Date Reported", "")),
                "pct_held": row.get("% Out"),
                "value": row.get("Value"),
            }
        )
    return records
# Apple earnings
aapl_earnings = get_earnings_data("AAPL")
print(f"Next AAPL earnings: {aapl_earnings['next_earnings_date']}")
print("\nRecent earnings history:")
for e in aapl_earnings["earnings_history"][-4:]:
    surprise = e.get("surprise", 0) or 0
    direction = "+" if surprise > 0 else ""
    # The EPS keys are always present but may hold None; applying ":.2f" to
    # None (or to a "?" placeholder) raises, so format each value separately.
    estimate = e.get("eps_estimate")
    actual = e.get("eps_actual")
    estimate_text = f"{estimate:.2f}" if estimate is not None else "?"
    actual_text = f"{actual:.2f}" if actual is not None else "?"
    print(f" {e['date']}: Est ${estimate_text} | "
          f"Actual ${actual_text} | "
          f"Surprise: {direction}{surprise:.1f}%")
Method 2: Direct Yahoo Finance API
yfinance breaks periodically when Yahoo changes their authentication flow. When that happens, hit Yahoo's undocumented v8/v10 API directly:
import httpx
import re
import time
import random
class YahooFinanceDirect:
    """
    Direct Yahoo Finance API client.
    Use as fallback when yfinance breaks, or for fine-grained control.

    Creating an instance performs network I/O: it requests a Yahoo page to
    collect cookies and then fetches a crumb token, which is sent with every
    later request. A 401 response triggers exactly one session refresh and
    retry; a second 401 raises ``RuntimeError`` instead of retrying forever.

    The instance owns an ``httpx.Client``. Call ``close()`` or use the
    instance as a context manager to release it.
    """

    BASE = "https://query2.finance.yahoo.com"
    CONSENT_URL = "https://finance.yahoo.com"

    def __init__(self, proxy_url: str = None):
        """Create the HTTP client (optionally through ``proxy_url``) and open a session.

        Raises:
            RuntimeError: if the crumb token cannot be fetched.
        """
        self.proxy = proxy_url
        # NOTE(review): "br" is advertised in Accept-Encoding below; confirm the
        # runtime has brotli support installed for httpx, or drop it.
        self.client = httpx.Client(
            timeout=20,
            proxy=proxy_url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
                "Accept": "application/json, */*",
                "Accept-Language": "en-US,en;q=0.9",
                "Accept-Encoding": "gzip, deflate, br",
            },
            follow_redirects=True,
        )
        self.crumb = None
        try:
            self._init_session()
        except Exception:
            # Do not leak the client when the session cannot be established.
            self.client.close()
            raise

    def close(self):
        """Close the underlying HTTP client."""
        self.client.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    def _init_session(self):
        """Initialize session cookies and get crumb token.

        Raises:
            RuntimeError: if the crumb endpoint does not answer with HTTP 200.
        """
        # Step 1: Visit the main page to get cookies (including consent cookies).
        # Deliberately best-effort: if this fails, the crumb request below is
        # what reports the problem.
        try:
            self.client.get(f"{self.CONSENT_URL}/quote/AAPL", timeout=15)
        except Exception:
            pass
        # Step 2: Get the crumb token
        resp = self.client.get(
            f"{self.BASE}/v1/test/getcrumb",
            headers={"Accept": "text/plain"},
        )
        if resp.status_code != 200:
            raise RuntimeError(
                f"Failed to get crumb: {resp.status_code}. Try a different proxy."
            )
        self.crumb = resp.text.strip()

    def _get_json(self, path: str, params: dict) -> dict:
        """GET ``BASE + path`` with the current crumb and return the decoded JSON.

        On a 401 the session is re-initialized once and the request is repeated
        with the fresh crumb.

        Raises:
            RuntimeError: if the retried request is also answered with 401.
        """
        for refreshed in (False, True):
            resp = self.client.get(
                f"{self.BASE}{path}",
                params={**params, "crumb": self.crumb},
            )
            if resp.status_code != 401:
                return resp.json()
            if not refreshed:
                # Crumb expired — reinitialize, then loop for the single retry.
                self._init_session()
        raise RuntimeError(
            f"Yahoo returned 401 for {path} even after refreshing the crumb."
        )

    @staticmethod
    def _unwrap(field, key: str = "raw"):
        """Return ``field[key]`` when ``field`` is a dict, otherwise ``field`` itself."""
        if isinstance(field, dict):
            return field.get(key)
        return field

    @staticmethod
    def _utc_date(epoch_seconds) -> str:
        """Format a Unix timestamp as a ``YYYY-MM-DD`` date in UTC."""
        # Local import keeps the class independent of the file's import lines.
        from datetime import datetime, timezone

        return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).strftime("%Y-%m-%d")

    def get_quote(self, ticker: str) -> dict:
        """Real-time quote data for a ticker.

        Returns the first entry of ``quoteResponse.result``, or ``{}`` when the
        response carries no result.
        """
        data = self._get_json(
            "/v7/finance/quote",
            {
                "symbols": ticker,
                "fields": "regularMarketPrice,regularMarketChangePercent,regularMarketVolume,"
                          "marketCap,trailingPE,forwardPE,fiftyTwoWeekHigh,fiftyTwoWeekLow,"
                          "regularMarketDayHigh,regularMarketDayLow",
            },
        )
        result = (data.get("quoteResponse") or {}).get("result") or []
        return result[0] if result else {}

    def get_history(
        self,
        ticker: str,
        range: str = "1y",
        interval: str = "1d",
        include_pre_post: bool = False,
    ) -> list[dict]:
        """Historical price data via the v8 chart API.

        Returns one dict per bar with ``date`` (UTC, ``YYYY-MM-DD``), ``open``,
        ``high``, ``low``, ``close``, ``adj_close`` and ``volume``. Bars whose
        close is null are skipped. Individual fields are ``None`` only when
        Yahoo sent null; a genuine ``0`` (e.g. zero volume) is preserved.
        Returns ``[]`` (after printing the error) when the chart reports one.
        """
        # NOTE: the ``range`` parameter shadows the builtin inside this method,
        # so the builtin is intentionally not used below.
        data = self._get_json(
            f"/v8/finance/chart/{ticker}",
            {
                "range": range,
                "interval": interval,
                "includePrePost": str(include_pre_post).lower(),
                "events": "div,splits",
            },
        )
        chart = data.get("chart") or {}
        if chart.get("error"):
            print(f"Yahoo API error: {chart['error']}")
            return []

        # ``result`` may be null or an empty list; treat both as "no data".
        result = (chart.get("result") or [{}])[0]
        timestamps = result.get("timestamp") or []
        indicators = result.get("indicators") or {}
        quotes = (indicators.get("quote") or [{}])[0]
        adjclose_data = (indicators.get("adjclose") or [{}])[0]
        count = len(timestamps)

        def column(source: dict, key: str) -> list:
            # Missing or short series are padded with None to the bar count.
            values = list(source.get(key) or [])
            return values + [None] * (count - len(values))

        def rounded(value):
            return round(value, 4) if value is not None else None

        rows = []
        bars = zip(
            timestamps,
            column(quotes, "open"),
            column(quotes, "high"),
            column(quotes, "low"),
            column(quotes, "close"),
            column(adjclose_data, "adjclose"),
            column(quotes, "volume"),
        )
        for ts, o, h, l, c, adj_c, v in bars:
            if c is None:
                continue  # Skip null candles (weekends/holidays in range)
            rows.append({
                "date": self._utc_date(ts),
                "open": rounded(o),
                "high": rounded(h),
                "low": rounded(l),
                "close": rounded(c),
                "adj_close": rounded(adj_c),
                "volume": int(v) if v is not None else None,
            })
        return rows

    def get_quote_summary(self, ticker: str, modules: list[str]) -> dict:
        """
        Flexible endpoint for any type of fundamental data.
        Available modules:
        assetProfile, summaryProfile, summaryDetail, price,
        defaultKeyStatistics, financialData, earningsTrend,
        earningsHistory, incomeStatementHistory, balanceSheetHistory,
        cashflowStatementHistory, recommendationTrend, upgradeDowngradeHistory,
        institutionOwnership, fundOwnership, majorHoldersBreakdown

        Returns the first result entry, ``{}`` when there is none, or
        ``{"error": ...}`` when the response reports an error.
        """
        data = self._get_json(
            f"/v10/finance/quoteSummary/{ticker}",
            {"modules": ",".join(modules)},
        )
        summary = data.get("quoteSummary") or {}
        if summary.get("error"):
            return {"error": summary["error"]}
        result = summary.get("result") or []
        return result[0] if result else {}

    def get_earnings(self, ticker: str) -> list[dict]:
        """Earnings history with EPS estimates vs actuals.

        Entries without an actual EPS value (missing, null, or a dict with no
        ``raw`` value) are left out. ``surprise_pct`` is the reported
        ``surprisePercent`` multiplied by 100 and rounded to one decimal;
        a missing value counts as 0.
        """
        summary = self.get_quote_summary(ticker, ["earningsHistory"])
        history = (summary.get("earningsHistory") or {}).get("history") or []

        rows = []
        for h in history:
            eps_actual = self._unwrap(h.get("epsActual"))
            if eps_actual is None:
                continue
            quarter_date = h.get("quarterDate")
            surprise = self._unwrap(h.get("surprisePercent")) or 0
            rows.append({
                "quarter": self._unwrap(h.get("period"), "fmt"),
                "date": quarter_date.get("fmt") if isinstance(quarter_date, dict) else None,
                "eps_estimate": self._unwrap(h.get("epsEstimate")),
                "eps_actual": eps_actual,
                "surprise_pct": round(surprise * 100, 1),
            })
        return rows

    def get_analyst_recommendations(self, ticker: str) -> list[dict]:
        """Get analyst rating changes history.

        Returns at most the first 20 entries of ``upgradeDowngradeHistory``.
        Dates are rendered in UTC, consistent with ``get_history``.
        """
        summary = self.get_quote_summary(ticker, ["upgradeDowngradeHistory"])
        history = (summary.get("upgradeDowngradeHistory") or {}).get("history") or []
        return [
            {
                "date": self._utc_date(h.get("epochGradeDate") or 0),
                "firm": h.get("firm"),
                "to_grade": h.get("toGrade"),
                "from_grade": h.get("fromGrade"),
                "action": h.get("action"),
            }
            for h in history[:20]  # Last 20 rating changes
        ]
# Usage
yf_api = YahooFinanceDirect()  # or YahooFinanceDirect(proxy_url="http://user:pass@proxy:port")

# Real-time quote
quote = yf_api.get_quote("NVDA")
# get_quote returns {} when Yahoo sends no result; a missing price must not be
# pushed through ":.2f" (formatting the "N/A" placeholder raises ValueError).
price = quote.get("regularMarketPrice")
price_text = f"{price:.2f}" if price is not None else "N/A"
print(f"NVDA: ${price_text} "
      f"({quote.get('regularMarketChangePercent') or 0:.2f}%)")

# Historical data
history = yf_api.get_history("TSLA", range="6mo", interval="1d")
print(f"\nTSLA 6-month history: {len(history)} bars")
# Only compute the return when both endpoints are usable, non-zero prices.
if history and history[0]["close"] and history[-1]["close"] is not None:
    first_close = history[0]["close"]
    last_close = history[-1]["close"]
    print(f"6-month return: {((last_close/first_close) - 1) * 100:.1f}%")

# Earnings
nvda_earnings = yf_api.get_earnings("NVDA")
print(f"\nNVDA Recent Earnings:")
for e in nvda_earnings[-4:]:
    # EPS fields can be None when Yahoo omits them; show "?" instead of crashing.
    estimate_text = f"{e['eps_estimate']:.2f}" if e["eps_estimate"] is not None else "?"
    actual_text = f"{e['eps_actual']:.2f}" if e["eps_actual"] is not None else "?"
    print(f" {e['quarter']}: Est ${estimate_text} | "
          f"Actual ${actual_text} | "
          f"Surprise: {'+' if e['surprise_pct'] > 0 else ''}{e['surprise_pct']:.1f}%")
Batch Scraping Multiple Tickers
When tracking a portfolio or running screens across hundreds of tickers:
import sqlite3
import time
import random
def setup_stock_database(db_path: str) -> sqlite3.Connection:
    """Open ``db_path`` and make sure the ``quotes`` and ``price_history`` tables exist.

    The connection is switched to WAL journaling, both tables are created if
    missing, the change is committed, and the open connection is returned.
    """
    schema = (
        """
        CREATE TABLE IF NOT EXISTS quotes (
            ticker TEXT,
            price REAL,
            change_pct REAL,
            market_cap INTEGER,
            pe_ratio REAL,
            volume INTEGER,
            high_52w REAL,
            low_52w REAL,
            timestamp TEXT,
            PRIMARY KEY (ticker, timestamp)
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS price_history (
            ticker TEXT,
            date TEXT,
            open REAL,
            high REAL,
            low REAL,
            close REAL,
            adj_close REAL,
            volume INTEGER,
            PRIMARY KEY (ticker, date)
        )
        """,
    )

    connection = sqlite3.connect(db_path)
    connection.execute("PRAGMA journal_mode=WAL")
    for statement in schema:
        connection.execute(statement)
    connection.commit()
    return connection
def batch_download_prices(
    tickers: list[str],
    period: str = "1y",
    db_path: str = "stocks.db",
) -> dict:
    """
    Download historical prices for multiple tickers using yfinance's batch mode.
    yfinance's download() is significantly faster than calling Ticker() one by one.

    Rows are upserted into the ``price_history`` table of ``db_path``. Bars
    without a close price are skipped explicitly. Other missing prices are
    stored as NULL and a missing volume as 0, so one bad cell no longer makes
    the whole row disappear silently.

    Returns a dict mapping each ticker to the number of rows written
    (0 when the ticker had no data or raised an error).
    """
    print(f"Downloading {period} history for {len(tickers)} tickers...")
    # yfinance supports batch downloads — much faster than individual requests
    data = yf.download(
        tickers,
        period=period,
        group_by="ticker",
        auto_adjust=True,
        progress=True,
        threads=True,  # Use threads for parallel downloads
    )

    def price(value):
        # NaN/None -> NULL in SQLite; everything else rounded to 4 decimals.
        return None if pd.isna(value) else round(float(value), 4)

    def volume(value):
        return 0 if pd.isna(value) else int(value)

    conn = setup_stock_database(db_path)
    saved_by_ticker = {}
    try:
        for ticker in tickers:
            try:
                # Pick the ticker's sub-frame when the columns are grouped by
                # ticker; a flat frame is used as-is.
                if isinstance(data.columns, pd.MultiIndex):
                    ticker_data = data[ticker]
                else:
                    ticker_data = data
                ticker_data = ticker_data.dropna(subset=["Close"])
                if ticker_data.empty:
                    print(f" {ticker}: No data")
                    saved_by_ticker[ticker] = 0
                    continue

                rows = [
                    (
                        ticker,
                        date.strftime("%Y-%m-%d"),
                        price(row.get("Open")),
                        price(row.get("High")),
                        price(row.get("Low")),
                        price(row.get("Close")),
                        price(row.get("Close")),  # adj_close same as close when auto_adjust=True
                        volume(row.get("Volume")),
                    )
                    for date, row in ticker_data.iterrows()
                ]
                conn.executemany(
                    "INSERT OR REPLACE INTO price_history VALUES (?,?,?,?,?,?,?,?)",
                    rows,
                )
                saved_by_ticker[ticker] = len(rows)
            except Exception as e:
                print(f" {ticker}: Error — {e}")
                saved_by_ticker[ticker] = 0
        conn.commit()
    finally:
        # Release the connection even if something unexpected escapes the loop.
        conn.close()
    return saved_by_ticker
def batch_scrape_fundamentals(
    tickers: list[str],
    db_path: str = "stocks.db",
    delay: float = 0.5,
) -> int:
    """Scrape current quotes and fundamentals for multiple tickers.

    Each ticker's ``info`` is read through yfinance and one row is upserted
    into the ``quotes`` table of ``db_path``, all sharing one timestamp taken
    at the start of the run. Tickers without a price are reported and skipped.
    A random pause of ``delay`` to ``2 * delay`` seconds follows every ticker
    except the last — including skipped and failed ones, so a run of bad
    tickers cannot turn into a burst of back-to-back requests.

    Returns the number of rows written.
    """
    conn = setup_stock_database(db_path)
    now = datetime.now().isoformat()
    saved = 0
    last_index = len(tickers) - 1
    try:
        for i, ticker in enumerate(tickers):
            try:
                info = yf.Ticker(ticker).info
                price = info.get("currentPrice") or info.get("regularMarketPrice")
                if not price:
                    print(f" {ticker}: No price data")
                else:
                    conn.execute(
                        "INSERT OR REPLACE INTO quotes VALUES (?,?,?,?,?,?,?,?,?)",
                        (
                            ticker,
                            price,
                            info.get("regularMarketChangePercent"),
                            info.get("marketCap"),
                            info.get("trailingPE"),
                            info.get("averageVolume"),
                            info.get("fiftyTwoWeekHigh"),
                            info.get("fiftyTwoWeekLow"),
                            now,
                        ),
                    )
                    saved += 1
                    if (i + 1) % 10 == 0:
                        conn.commit()
                        print(f" Progress: {i+1}/{len(tickers)}")
            except Exception as e:
                print(f" {ticker}: Error — {e}")

            # Rate control — Yahoo throttles after ~100 rapid requests.
            # Nothing follows the final ticker, so there is nothing to wait for.
            if i < last_index:
                time.sleep(random.uniform(delay, delay * 2))
        conn.commit()
    finally:
        conn.close()
    return saved
# S&P 500 sample: twenty large-cap tickers used by both batch jobs below.
sp500_sample = (
    "AAPL MSFT NVDA GOOGL AMZN META TSLA BRK-B "
    "UNH JPM V XOM JNJ MA HD CVX ABBV MRK "
    "PG AVGO"
).split()

# Step 1: bulk-download one year of prices into the database.
saved = batch_download_prices(sp500_sample, period="1y")
total_bars = sum(saved.values())
print(f"\nDownloaded prices: {total_bars} total bars")

# Step 2: store a current fundamentals snapshot for each ticker.
count = batch_scrape_fundamentals(sp500_sample)
print(f"Saved {count}/{len(sp500_sample)} fundamental records")
When Yahoo Blocks You
Yahoo Finance blocks when you cross their invisible rate limits. Symptoms:
- 401 errors on requests that previously worked
- Empty result arrays in API responses
- Redirects to a cookie consent page
- yfinance raising YFRateLimitError
What triggers blocks:
- More than ~2,000 requests/hour from a single IP
- Rapid succession requests without any delay
- Datacenter IP ranges (AWS, GCP are pre-flagged)
- Missing or expired crumb token without proper cookie
Fixes in order of escalation:
Level 1: Add delays
import time
import random
# Pause for a random 0.5–1.5 seconds between each request.
pause_seconds = random.uniform(0.5, 1.5)
time.sleep(pause_seconds)
Level 2: Refresh the crumb
# Force yfinance to get a fresh session
# NOTE(review): `yf.utils.get_crumb_and_cookies` is not an API this article
# demonstrates anywhere else — confirm the attribute exists in the installed
# yfinance version before relying on it; if it is missing this line raises
# AttributeError.
import yfinance as yf
yf.utils.get_crumb_and_cookies.cache_clear()
Level 3: Use proxies
Residential proxies from ThorData work well with Yahoo Finance — their IPs come from real ISPs rather than datacenter ranges, so Yahoo's ASN-based blocking doesn't flag them:
import os
# Placeholder proxy URL in the form http://USER:PASSWORD@HOST:PORT.
# Replace the credentials and host with the values from your proxy provider.
# (The host is a stand-in: the URL printed here originally was mangled by
# e-mail obfuscation and was not a valid URL.)
PROXY_URL = "http://user:pass@proxy.example.com:7777"

# Method 1: Environment variable (yfinance respects this)
os.environ["HTTPS_PROXY"] = PROXY_URL
os.environ["HTTP_PROXY"] = PROXY_URL

# Method 2: Use direct API client with explicit proxy
yf_api = YahooFinanceDirect(proxy_url=PROXY_URL)

# Method 3: Pass proxy to yfinance Ticker
# NOTE(review): whether `Ticker(..., proxy=...)` is accepted depends on the
# installed yfinance version — confirm against its documentation, and prefer
# Method 1 if the keyword is rejected or reported as deprecated.
stock = yf.Ticker("AAPL", proxy=PROXY_URL)
Handling Data Quality Issues
Yahoo Finance data isn't always clean. Common problems:
import numpy as np
def safe_get_history(
    ticker: str,
    period: str = "1y",
    retries: int = 3,
    proxy: str = None,
) -> pd.DataFrame | None:
    """Fetch history with retry logic and data validation.

    Up to ``retries`` attempts are made, with exponential back-off
    (1 s, 2 s, 4 s, ...) between attempts but not after the last one.
    On success, zero closes are treated as missing and forward-filled, the
    index is made timezone-naive and the frame is rounded to 4 decimals.
    Warnings are printed when more than 5% of closes are missing or when any
    bar-to-bar move exceeds 50%.

    Args:
        proxy: when given, it is written to the ``HTTPS_PROXY`` environment
            variable, which affects the whole process, not just this call.

    Returns:
        The cleaned frame, or ``None`` when every attempt failed or came back empty.
    """
    # Local import: this snippet uses os but does not import it itself.
    import os

    if proxy:
        os.environ["HTTPS_PROXY"] = proxy

    for attempt in range(retries):
        try:
            stock = yf.Ticker(ticker)
            df = stock.history(period=period, auto_adjust=True)
            if df.empty:
                print(f" {ticker}: Empty response (attempt {attempt+1})")
            else:
                # Treat zero prices as missing BEFORE the checks below; a raw 0
                # would otherwise show up as a -100% move followed by an
                # infinite one and trigger a bogus "check for splits" warning.
                close = df["Close"].replace(0, np.nan)

                null_pct = close.isnull().mean()
                if null_pct > 0.05:
                    print(f" {ticker}: {null_pct:.0%} null values in Close")

                close = close.ffill()  # Forward fill small gaps

                # Detect suspiciously large gaps (possible splits not adjusted)
                returns = close.pct_change(fill_method=None).dropna()
                extreme = (returns.abs() > 0.5).sum()
                if extreme > 0:
                    print(f" {ticker}: {extreme} extreme price moves (>50%) — check for splits")

                df["Close"] = close

                # Remove timezone
                if hasattr(df.index, "tz") and df.index.tz is not None:
                    df.index = df.index.tz_localize(None)
                return df.round(4)
        except Exception as e:
            print(f" {ticker}: {e} (attempt {attempt+1})")

        # Back off only when another attempt will actually follow.
        if attempt < retries - 1:
            time.sleep(2 ** attempt)
    return None
# Validate a batch of tickers
def validate_tickers(tickers: list[str]) -> dict:
    """Label each ticker by the quality of its last three months of data.

    Every ticker is fetched with ``safe_get_history(ticker, period="3mo")``
    and mapped to ``"no_data"`` (nothing usable returned),
    ``"too_many_nulls"`` (more than 10% of closes are null) or ``"ok"``.
    """

    def classify(frame) -> str:
        if frame is None or frame.empty:
            return "no_data"
        if frame["Close"].isnull().mean() > 0.1:
            return "too_many_nulls"
        return "ok"

    return {
        ticker: classify(safe_get_history(ticker, period="3mo"))
        for ticker in tickers
    }
Alternative Free Sources
If Yahoo Finance goes down or changes too much:
| Source | What You Get | Limits | Cost |
|---|---|---|---|
| FRED | Macro data, interest rates, economic indicators | Generous, free API key | Free |
| SEC EDGAR | Company filings (10-K, 10-Q, 8-K) | No limits | Free |
| Polygon.io | Real-time + historical quotes | 5 calls/min (free tier) | Free/Paid |
| Financial Modeling Prep | Comprehensive financials API | 250 req/day (free) | Free/Paid |
| Alpha Vantage | Historical + real-time prices | 25 req/day (free) | Free/Paid |
| Quandl/Nasdaq Data Link | Financial datasets | Limited free tier | Free/Paid |
import httpx
def get_fred_series(
    series_id: str,
    api_key: str,
    limit: int = 100,
    sort_order: str = "desc",
) -> list[dict]:
    """
    Fetch economic data from FRED (Federal Reserve Economic Data).
    Series examples: GDP, UNRATE, FEDFUNDS, T10Y2Y, CPIAUCSL
    Free API key at: fred.stlouisfed.org/docs/api/api_key.html

    Args:
        series_id: FRED series identifier.
        api_key: FRED API key.
        limit: value sent as the ``limit`` query parameter (default 100).
        sort_order: value sent as the ``sort_order`` query parameter
            (default "desc").

    Returns:
        One ``{"date", "value"}`` dict per observation; ``value`` is ``None``
        where the observation's value is ".".

    Raises:
        httpx.HTTPStatusError: on a 4xx/5xx response, instead of silently
            returning an empty list (e.g. for a rejected API key).
    """
    resp = httpx.get(
        "https://api.stlouisfed.org/fred/series/observations",
        params={
            "series_id": series_id,
            "api_key": api_key,
            "file_type": "json",
            "sort_order": sort_order,
            "limit": limit,
        },
    )
    resp.raise_for_status()
    data = resp.json()
    return [
        {
            "date": obs["date"],
            "value": float(obs["value"]) if obs["value"] != "." else None,
        }
        for obs in data.get("observations", [])
    ]
# Federal Funds Rate
# fed_rate = get_fred_series("FEDFUNDS", "your_fred_api_key")
Key Takeaways
Yahoo Finance with yfinance remains the lowest-friction path for free stock data. Build your code so the data source is easily swappable — use an interface or adapter pattern so you can point at an alternative when Yahoo inevitably changes something:
- yfinance is the starting point: handles crumbs and cookies automatically
- YahooFinanceDirect is your fallback when yfinance breaks (happens every few months)
- Rate limits are invisible but real — add 0.5-1s delays between requests
- For high-volume screening, rotating residential proxies from ThorData bypass Yahoo's ASN-based blocking
- Always validate data quality — null values, extreme moves, and missing periods are common
- For macro/economic data alongside stock prices, combine Yahoo with FRED for a complete picture