import numpy as np
import pandas as pd
import wrds
# open a database connection to WRDS (uses a .pgpass file for credentials if present)
conn = wrds.Connection()
WRDS recommends setting up a .pgpass file. You can create this file yourself at any time with the create_pgpass_file() function. Loading library list... Done
An example of pulling data from the annual Compustat table. datadate is the end of the fiscal year. We impose standard filters. See https://wrds-web.wharton.upenn.edu/wrds/demo/demoform_compustat.cfm for a full list of Compustat variable definitions. Quarterly data is also available.
# annual Compustat fundamentals: firm identifier (gvkey), fiscal year end
# (datadate), and total assets (at), with the standard screens on industry
# format, data format, population source, and consolidation level
comp = conn.raw_sql(
"""
select gvkey, datadate, at
from comp.funda
where datadate >= '2000-01-01' and at>0
and indfmt='INDL' and datafmt='STD' and popsrc='D' and consol='C'
order by gvkey, datadate
""",
date_cols=['datadate']  # parse datadate into a pandas datetime column
)
# convert string or float to int
comp.gvkey = comp.gvkey.astype(int)
def days_between(dates: pd.Series) -> pd.Series:
    """Return the number of days between consecutive entries of a datetime Series.

    The first element is NaN because there is no prior date to difference
    against (diff() yields NaT there).
    """
    # .diff() produces Timedeltas; the vectorized .dt.days accessor converts
    # them to float days, mapping NaT to NaN — no per-element lambda needed.
    return dates.diff().dt.days
comp["days"] = comp.groupby("gvkey").datadate.apply(days_between)
Set the asset-growth variable to missing if more than 720 days elapsed between consecutive datadates.
# Asset growth: percentage change in total assets within each firm.
comp["inv"] = comp.groupby("gvkey")["at"].pct_change()
# Invalidate growth computed over a gap of more than 720 days between
# annual reports (e.g., missing fiscal years for the firm).
long_gap = comp["days"].notnull() & (comp["days"] > 720)
comp["inv"] = np.where(long_gap, np.nan, comp["inv"])
Example
comp[comp.gvkey==1019].head()
gvkey | datadate | at | days | inv | |
---|---|---|---|---|---|
38 | 1019 | 2000-12-31 | 28.638 | NaN | NaN |
39 | 1019 | 2001-12-31 | 30.836 | 365.0 | 0.076751 |
40 | 1019 | 2007-12-31 | 34.180 | 2191.0 | NaN |
41 | 1019 | 2008-12-31 | 33.486 | 366.0 | -0.020304 |
42 | 1019 | 2009-12-31 | 32.678 | 365.0 | -0.024129 |
Shift all annual reports in a calendar year to June 30 of the following year.
# define date as June 30 of year following datadate
comp['date'] = pd.to_datetime(comp.datadate.map(lambda d: str(d.year+1)+'-06-30'))
# if two annual reports in one calendar year (due to change of fiscal year), keep last one
# (rows are ordered by gvkey, datadate from the SQL, so 'last' is the most recent datadate)
comp = comp.drop_duplicates(subset=['gvkey', 'date'], keep='last')
# CRSP/Compustat link table: each gvkey mapped to a permno together with the
# date range (linkdt to linkenddt) over which the link is valid
link = conn.raw_sql(
"""
select distinct gvkey, lpermno as permno, linkdt, linkenddt
from crsp.Ccmxpf_linktable
where linktype in ('LU', 'LC')
and LINKPRIM in ('P', 'C')
"""
)
# convert strings or floats to ints
link['gvkey'] = link.gvkey.astype(int)
link['permno'] = link.permno.astype(int)
# fill in missing end dates with a future date
# (a missing linkenddt indicates the link is still in effect)
link['linkenddt'] = pd.to_datetime(link.linkenddt).fillna(pd.Timestamp('21000101'))
# merge with Compustat data and keep rows with Compustat datadate between link date and link end date
comp = comp.merge(link, on='gvkey', how='inner')
comp = comp[(comp.datadate>=comp.linkdt) & (comp.datadate<=comp.linkenddt)]
comp = comp.drop(columns=['days', 'gvkey', 'datadate', 'linkdt', 'linkenddt'])
Get stock prices and returns from CRSP (Center for Research in Security Prices). We use standard filters. See http://www.crsp.com/files/data_descriptions_guide_0.pdf for a complete set of variable definitions. CRSP uses PERMCO as a permanent company identifier and PERMNO as a permanent security identifier. Some companies have multiple classes of common stock, which means multiple common stock PERMNOs can be associated with a single PERMCO. Both monthly and daily data are available.
# monthly CRSP stock file joined with header info: returns (ret) and market
# equity (me = abs(prc) * shrout — abs() because some CRSP prices are stored
# as negative values), restricted to NYSE/AMEX/Nasdaq (exchcd 1,2,3)
# ordinary common shares (shrcd 10,11), with the name record valid on the date
crsp = conn.raw_sql(
"""
select a.permno, a.permco, a.date, a.ret, abs(a.prc)*a.shrout as me, b.exchcd
from crsp.msf a inner join crsp.msenames b
on a.permno=b.permno and a.date between b.namedt and b.nameendt
and b.exchcd in (1,2,3) and b.shrcd in (10,11)
where a.date >= '2000-01-01'
order by a.permno, a.date
""",
date_cols=['date']
)
# change strings or floats to integers
for col in ['permno','permco']:
    crsp[col] = crsp[col].astype(int)
# define market equity as sum of market equities of all permnos associated
# with a permco (multiple share classes roll up to one company-level value).
# Pass the string 'sum' rather than the builtin sum: pandas dispatches the
# string to its fast cythonized groupby sum, and passing builtins to
# transform is deprecated in recent pandas versions. The result is identical.
crsp['me'] = crsp.groupby(['date','permco']).me.transform('sum')
This is always done, but there are some different ways to do it. Here, we follow some of the literature and assign a lower delisting return to Nasdaq stocks than to NYSE/AMEX stocks if the delisting return is missing and delisting was due to poor performance.
# delisting events: delisting return (dlret) and delisting code (dlstcd);
# dlstcd > 100 keeps actual delistings (presumably 100 denotes a still-active
# issue — confirm against the CRSP code documentation)
mse = conn.raw_sql(
"""
select permno, dlret, dlstcd
from crsp.mse
where event='DELIST' and dlstcd>100
order by permno
"""
)
# change string or float to int
mse['permno'] = mse.permno.astype(int)
# merge with crsp, keeping all rows of crsp
# (dlret/dlstcd attach to every monthly row of a delisted permno; the code
# below only uses them on each permno's last observation)
crsp = crsp.merge(mse, how='left', on='permno')
del mse
# series of True and False, True if it is the last date for a stock
# (rows are sorted by permno, date, so a change in permno marks the last row)
LastObs = crsp.permno != crsp.permno.shift(-1)
# series of True and False, True if delisted for poor performance
DLCode = (crsp.dlstcd==500) | ((crsp.dlstcd >=520)&(crsp.dlstcd<=584))
# -35% if no delisting return, delisted for poor performance and NYSE/AMEX
crsp['dlret'] = np.where(DLCode & crsp.dlret.isnull() & crsp.exchcd.isin([1,2]), -0.35, crsp.dlret)
# -55% if no delisting return, delisted for poor performance and Nasdaq
crsp['dlret'] = np.where(DLCode & crsp.dlret.isnull() & (crsp.exchcd==3), -0.55, crsp.dlret)
# if delisting return exists and < -1, change to -1 (cannot lose more than 100%)
# BUGFIX: the comparison must be parenthesized. The original
# `crsp.dlret.notnull() & crsp.dlret<-1` parses as
# `(crsp.dlret.notnull() & crsp.dlret) < -1` because `&` binds more
# tightly than `<` in Python, which is not the intended condition.
crsp['dlret'] = np.where(crsp.dlret.notnull() & (crsp.dlret < -1), -1, crsp.dlret)
# if last day and return exists, define return by compounding with delisting return (if exists)
crsp['ret'] = np.where(LastObs & crsp.ret.notnull(), (1+crsp.ret)*(1+crsp.dlret.fillna(0))-1, crsp.ret)
# if last day and return does not exist, define return as delisting return
crsp['ret'] = np.where(LastObs & crsp.ret.isnull(), crsp.dlret, crsp.ret)
# convert both date columns to monthly periods so the merge keys on year-month
crsp.date = crsp.date.dt.to_period('M')
comp.date = comp.date.dt.to_period('M')
# left merge: keep every CRSP month, attach Compustat data where available
df = crsp.merge(comp, on=['permno', 'date'], how='left')
# carry each annual observation forward for at most 11 additional months,
# i.e. until the next June-30-aligned observation would take over
df[['at', 'inv']] = df.groupby('permno')[['at', 'inv']].ffill(limit=11)
df.head()
permno | permco | date | ret | me | exchcd | dlret | dlstcd | at | days | inv | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10001 | 7953 | 2000-01 | -0.044118 | 19906.25 | 3.0 | 0.011583 | 233.0 | NaN | NaN | NaN |
1 | 10001 | 7953 | 2000-02 | 0.015385 | 20212.50 | 3.0 | 0.011583 | 233.0 | NaN | NaN | NaN |
2 | 10001 | 7953 | 2000-03 | -0.015758 | 19712.00 | 3.0 | 0.011583 | 233.0 | NaN | NaN | NaN |
3 | 10001 | 7953 | 2000-04 | 0.011719 | 19943.00 | 3.0 | 0.011583 | 233.0 | NaN | NaN | NaN |
4 | 10001 | 7953 | 2000-05 | -0.023166 | 19481.00 | 3.0 | 0.011583 | 233.0 | NaN | NaN | NaN |
df[df.days>720].gvkey.unique()
array([177405., 28877., 30950., 15253., 61697., 120354., 114747., 135425., 25632.])
Often one fills in a missing Compustat variable with another variable or with some calculation involving other variables. An example is the calculation of preferred stock by Fama and French when they calculate book equity. They use pstkrv if it is not missing; if it is missing, they use pstkl; if both of those are missing, they use pstk; if all three of those are missing they set it to zero.
Here is an implementation using bfill. axis=1 means fill along each row, across the columns. With axis=1, bfill pulls values from the right (later columns) toward the left. It produces a frame of the same shape that you started with. The .iloc[:,0] selects the first (zero-th) column of that frame.
# pull the three preferred-stock measures used by Fama and French:
# pstkrv (redemption value), pstkl (liquidating value), pstk (par value),
# in their order of preference
comp = conn.raw_sql(
"""
select gvkey, datadate, pstkrv, pstkl, pstk
from comp.funda
where datadate >= '2000-01-01'
and indfmt='INDL' and datafmt='STD' and popsrc='D' and consol='C'
order by gvkey, datadate
""",
date_cols=['datadate']
)
# first non-missing of pstkrv, pstkl, pstk (in that order), else zero:
# bfill(axis=1) pulls the first available value leftward into the pstkrv
# position, .iloc[:, 0] takes that first column, and fillna(0) handles
# rows where all three measures are missing
comp['preferred'] = comp[['pstkrv','pstkl','pstk']].bfill(axis=1).iloc[:, 0].fillna(0)
# rows where at least one preferred-stock measure was available
comp[comp.preferred != 0].head()
gvkey | datadate | pstkrv | pstkl | pstk | preferred | |
---|---|---|---|---|---|---|
23 | 001010 | 2000-12-31 | 11.225 | 9.600 | 9.600 | 11.225 |
24 | 001010 | 2001-12-31 | 48.070 | 48.070 | 48.100 | 48.070 |
25 | 001010 | 2002-12-31 | 70.209 | 70.209 | 70.200 | 70.209 |
65 | 001021 | 2006-06-30 | 4.744 | 4.744 | 4.744 | 4.744 |
77 | 001037 | 2000-03-31 | 2.583 | 2.583 | 2.583 | 2.583 |
comp[(comp.pstkrv.isna()) & (comp.preferred != 0)].head()
gvkey | datadate | pstkrv | pstkl | pstk | preferred | |
---|---|---|---|---|---|---|
1303 | 001429 | 2000-08-31 | NaN | NaN | 38.275 | 38.275 |
1304 | 001429 | 2001-08-31 | NaN | NaN | 38.275 | 38.275 |
2857 | 001932 | 2000-12-31 | NaN | NaN | 44.865 | 44.865 |
2858 | 001932 | 2001-12-31 | NaN | NaN | 43.629 | 43.629 |
2859 | 001932 | 2002-12-31 | NaN | 49.895 | 49.895 | 49.895 |