Topics for today¶
- Warm-up with pandas
- Merging datasets
- Visualization
- Fama-French factors and 5x5 portfolios
- WAGE1.DTA
- CRSP and Compustat from WRDS
Warm-up¶
Use pandas to build a 2x2 table of average wage by male/female and white/nonwhite from WAGE1.DTA
- Use groupby
- Use nested for loops
- Use a pivot table
Use GitHub Copilot chat for general advice. GitHub code completion can also help.
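A sketch of the three approaches, using a small synthetic frame in place of WAGE1.DTA (the column names wage, female, and nonwhite are assumptions about that file; in class you would start from pd.read_stata('WAGE1.DTA')):

```python
import pandas as pd

# synthetic stand-in for WAGE1.DTA
df = pd.DataFrame({
    'wage': [10.0, 12.0, 8.0, 9.0, 15.0, 11.0, 7.0, 6.0],
    'female': [0, 0, 1, 1, 0, 0, 1, 1],
    'nonwhite': [0, 1, 0, 1, 0, 1, 0, 1],
})

# 1. groupby: long format, one row per (female, nonwhite) cell
by_group = df.groupby(['female', 'nonwhite']).wage.mean()

# 2. nested for loops: build the same table cell by cell
table = {}
for f in [0, 1]:
    for n in [0, 1]:
        subset = df[(df.female == f) & (df.nonwhite == n)]
        table[(f, n)] = subset.wage.mean()

# 3. pivot table: the 2x2 layout directly (default aggfunc is the mean)
pivot = df.pivot_table(values='wage', index='female', columns='nonwhite')

print(by_group)
print(pivot)
```

All three give the same four cell means; the pivot table is the one that prints as a 2x2 grid.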
Merging dataframes¶
- pd.concat, df.join, df.merge
- pd.concat can stack dataframes on top of each other or side by side
- df.join is a special case of merge
- df.merge is the most general and flexible
import pandas as pd
# Sample DataFrames
df1 = pd.DataFrame({
'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']
})
df2 = pd.DataFrame({
'A': ['A3', 'A4', 'A5'],
'B': ['B3', 'B4', 'B5']
})
# pd.concat - stacking vertically
concat_vertical = pd.concat([df1, df2], axis=0)
print(df1, df2, concat_vertical, sep='\n\n')
    A   B
0  A0  B0
1  A1  B1
2  A2  B2

    A   B
0  A3  B3
1  A4  B4
2  A5  B5

    A   B
0  A0  B0
1  A1  B1
2  A2  B2
0  A3  B3
1  A4  B4
2  A5  B5
# pd.concat - stacking horizontally
df3 = pd.DataFrame({
'C': ['C0', 'C1', 'C2'],
'D': ['D0', 'D1', 'D2']
})
concat_horizontal = pd.concat([df1, df3], axis=1)
print(df1)
print(df3)
print(concat_horizontal)
    A   B
0  A0  B0
1  A1  B1
2  A2  B2
    C   D
0  C0  D0
1  C1  D1
2  C2  D2
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
# df.join - joining on index
df4 = pd.DataFrame({
'E': ['E0', 'E1', 'E2']
}, index=[0, 1, 2])
joined = df1.join(df4)
print(df1, df4, joined, sep='\n\n')
    A   B
0  A0  B0
1  A1  B1
2  A2  B2

    E
0  E0
1  E1
2  E2

    A   B   E
0  A0  B0  E0
1  A1  B1  E1
2  A2  B2  E2
# df.merge - merging on a common column
df5 = pd.DataFrame({
'A': ['A0', 'A1', 'A2'],
'C': ['C0', 'C1', 'C2']
})
merged = df1.merge(df5, on='A')
print(df1, df5, merged, sep='\n\n')
    A   B
0  A0  B0
1  A1  B1
2  A2  B2

    A   C
0  A0  C0
1  A1  C1
2  A2  C2

    A   B   C
0  A0  B0  C0
1  A1  B1  C1
2  A2  B2  C2
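To see why df.merge is the most general of the three, here is a sketch of the how= and indicator= options on frames whose keys only partially overlap (df6 is a new example frame, not one defined above):

```python
import pandas as pd

df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']})
df6 = pd.DataFrame({'A': ['A1', 'A2', 'A3'], 'C': ['C1', 'C2', 'C3']})

# inner (the default): only keys present in both frames survive
inner = df1.merge(df6, on='A', how='inner')

# outer: all keys from both frames, NaN where one side is missing;
# indicator=True adds a _merge column recording where each row came from
outer = df1.merge(df6, on='A', how='outer', indicator=True)

print(inner)
print(outer)
```

how='left' and how='right' keep all keys from one side only; the _merge column is handy for auditing how many rows matched.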
Another example¶
Ask Julius to use pandas-datareader to get the monthly Fama-French factors from 1990 to the present and the monthly 25 size and book-to-market sorted portfolios from 1980 to the present from French's data library and merge them.
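A sketch of the merge step, assuming the downloads are already done. With pandas-datareader, a call like pdr.DataReader('F-F_Research_Data_Factors', 'famafrench', start='1990') returns a dict of DataFrames whose monthly table (key 0) carries a monthly PeriodIndex; the dataset names and that structure are assumptions to verify against pdr.famafrench.get_available_datasets(). The synthetic frames below mimic that shape:

```python
import pandas as pd

# stand-ins for the factors (from 1990) and the 25 portfolios (from 1980),
# both indexed by a monthly PeriodIndex as pandas-datareader delivers them
factors = pd.DataFrame(
    {'Mkt-RF': [1.0, -0.5, 2.0, 0.3, -1.2, 0.8],
     'HML': [0.2, 0.1, -0.3, 0.4, 0.0, -0.1]},
    index=pd.period_range('1990-01', '1990-06', freq='M'),
)
ports = pd.DataFrame(
    {'SMALL LoBM': [float(i) for i in range(126)]},
    index=pd.period_range('1980-01', '1990-06', freq='M'),
)

# an inner join on the shared monthly PeriodIndex keeps the overlapping months
merged = factors.join(ports, how='inner')
print(merged.head())
```

The join drops the 1980s portfolio months that have no matching factor data, leaving one merged row per month from 1990 on.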
Plotting libraries¶
matplotlib: import matplotlib.pyplot as plt
seaborn: import seaborn as sns
- seaborn is built on top of matplotlib.
- Use matplotlib to control axis labels, title, etc.
plotly express: import plotly.express as px
plotly.graph_objects: import plotly.graph_objects as go
- plotly creates interactive html plots
- check out Vizly: https://vizly.fyi/
You can also plot directly from a dataframe as in
- df.plot(kind='line', x='date', y='value')
- You will still want matplotlib to control axis labels, title, etc.
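A minimal sketch of plotting straight from a dataframe and then using matplotlib to control labels and the title (the Agg backend and the output file name are just so this runs headless; in a notebook you would skip both):

```python
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so this runs without a display
import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({
    'date': pd.date_range('2020-01-01', periods=30, freq='D'),
    'value': range(30),
})

# plot from the dataframe, then use matplotlib to polish the axes
ax = df.plot(kind='line', x='date', y='value')
ax.set_xlabel('Date')
ax.set_ylabel('Cumulative value')
ax.set_title('df.plot with matplotlib styling')
plt.tight_layout()
plt.savefig('example_plot.png')  # hypothetical output file name
```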
Ask Julius to¶
- Plot the compounded returns of the Fama-French factors.
- Plot the compounded returns of the Fama-French factors in separate subplots.
- Create density plots and histograms of the Fama-French factors.
- Use the seaborn style whitegrid and repeat the above.
- Produce a scatter plot of HML on Mkt-RF.
- Create a seaborn regplot of HML on Mkt-RF with ci=None.
- Annotate outliers in the seaborn regplot of HML on Mkt-RF.
- Produce a scatter matrix of the Fama-French factors with regression lines and density plots on the diagonal.
- Change axis labels, change the size of fonts, add titles, change the colors of plots, use different line styles, use a different color palette, use a different seaborn style, etc.
Plotly: Ask Julius to¶
- Use plotly to create a scatter plot of HML on Mkt-RF with a regression line and include the date in the hover data.
- Provide a link to the html figure.
- Use pandas-datareader to get the 3 month, 1 year, 5 year, and 20 year Treasury yields from FRED.
- Use plotly to create an animation over time of a line plot of Treasury yields on maturity and provide a link to the html output.
Determinants of wages¶
- Ask Julius to read WAGE1.DTA
- Suppose we want to explore the determinants of wages in the data. What plots could you create to explore or present the data? (We'll run regressions later.)
WRDS¶
- The remainder of this notebook illustrates getting and using CRSP and Compustat data from WRDS
- See https://wrds-www.wharton.upenn.edu/search/?queryTerms=python&activeTab=navAlgoliaSearchTab for more advice and examples.
import numpy as np
import pandas as pd
import wrds
conn = wrds.Connection()
WRDS recommends setting up a .pgpass file. You can create this file yourself at any time with the create_pgpass_file() function. Loading library list... Done
Annual Compustat¶
An example of pulling data from the annual Compustat table. datadate is the end of the fiscal year. We impose standard filters. See https://wrds-web.wharton.upenn.edu/wrds/demo/demoform_compustat.cfm for a full list of Compustat variable definitions. Quarterly data is also available.
comp = conn.raw_sql(
"""
select gvkey, datadate, at
from comp.funda
where datadate >= '2000-01-01' and at>0
and indfmt='INDL' and datafmt='STD' and popsrc='D' and consol='C'
order by gvkey, datadate
""",
date_cols=['datadate']
)
# convert string or float to int
comp.gvkey = comp.gvkey.astype(int)
Days between datadates¶
def days_between(dates):
    # days between consecutive fiscal year-ends within a firm
    return dates.diff().dt.days

comp["days"] = comp.groupby("gvkey")["datadate"].transform(days_between)
Investment rate¶
Set the investment rate to missing if more than 720 days elapsed between datadates.
comp["inv"] = comp.groupby("gvkey")["at"].pct_change()
comp["inv"] = np.where((comp.days).notnull() & (comp.days>720), np.nan, comp.inv)
Example
comp[comp.gvkey==1019].head()
|    | gvkey | datadate   | at     | days   | inv       |
|----|-------|------------|--------|--------|-----------|
| 38 | 1019  | 2000-12-31 | 28.638 | NaN    | NaN       |
| 39 | 1019  | 2001-12-31 | 30.836 | 365.0  | 0.076751  |
| 40 | 1019  | 2007-12-31 | 34.180 | 2191.0 | NaN       |
| 41 | 1019  | 2008-12-31 | 33.486 | 366.0  | -0.020304 |
| 42 | 1019  | 2009-12-31 | 32.678 | 365.0  | -0.024129 |
Use Fama-French lagging¶
Shift all annual reports in a calendar year to June 30 of the following year.
# define date as June 30 of year following datadate
comp['date'] = pd.to_datetime(
comp.datadate.map(
lambda d: str(d.year+1)+'-06-30'
)
)
# if two annual reports in one calendar year (due to change of fiscal year), keep last one
comp = comp.drop_duplicates(
subset=['gvkey', 'date'], keep='last'
)
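The two steps above in miniature, on a firm whose fiscal-year change puts two annual reports in one calendar year (the year arithmetic here is a vectorized equivalent of the lambda above):

```python
import pandas as pd

# a firm with fiscal year-ends 2000-06-30 and 2000-12-31 in the same
# calendar year, plus a normal 2001-12-31 report
toy = pd.DataFrame({
    'gvkey': [1, 1, 1],
    'datadate': pd.to_datetime(['2000-06-30', '2000-12-31', '2001-12-31']),
})

# map each datadate to June 30 of the following calendar year
toy['date'] = pd.to_datetime(toy.datadate.dt.year.add(1).astype(str) + '-06-30')

# both 2000 reports map to 2001-06-30; keep only the later one
toy = toy.drop_duplicates(subset=['gvkey', 'date'], keep='last')
print(toy)
```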
Assign permnos if merging with CRSP¶
link = conn.raw_sql(
"""
select distinct gvkey, lpermno as permno, linkdt, linkenddt
from crsp.ccmxpf_linktable
where linktype in ('LU', 'LC')
and linkprim in ('P', 'C')
"""
)
# convert strings or floats to ints
link['gvkey'] = link.gvkey.astype(int)
link['permno'] = link.permno.astype(int)
# convert link dates to datetimes so they compare cleanly with datadate,
# and fill missing end dates with a far-future date
link['linkdt'] = pd.to_datetime(link.linkdt)
link['linkenddt'] = pd.to_datetime(link.linkenddt).fillna(pd.Timestamp('21000101'))
# merge with Compustat data and keep rows with Compustat datadate between link date and link end date
comp = comp.merge(link, on='gvkey', how='inner')
comp = comp[
(comp.datadate>=comp.linkdt) & (comp.datadate<=comp.linkenddt)
]
# drop the link variables; keep gvkey and days for the checks below
comp = comp.drop(columns=['datadate', 'linkdt', 'linkenddt'])
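The link-table pattern above in miniature: a many-to-many merge on the company key followed by keeping only rows whose datadate falls inside the link's validity window (toy keys and dates; the real table maps gvkey to permno):

```python
import pandas as pd

# two annual reports for one company...
comp_toy = pd.DataFrame({
    'gvkey': [1, 1],
    'datadate': pd.to_datetime(['2001-12-31', '2010-12-31']),
})
# ...and two links for that company, valid over different windows
link_toy = pd.DataFrame({
    'gvkey': [1, 1],
    'permno': [111, 222],
    'linkdt': pd.to_datetime(['2000-01-01', '2005-01-01']),
    'linkenddt': pd.to_datetime(['2004-12-31', '2100-01-01']),
})

# the merge produces every (report, link) pair; the filter keeps only the
# pairs where the report date lies inside the link's validity window
m = comp_toy.merge(link_toy, on='gvkey', how='inner')
m = m[(m.datadate >= m.linkdt) & (m.datadate <= m.linkenddt)]
print(m[['gvkey', 'datadate', 'permno']])
```

Each report ends up matched to exactly one permno: the 2001 report to the early link and the 2010 report to the later one.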
CRSP¶
Get stock prices and returns from CRSP (Center for Research in Security Prices). We use standard filters. See http://www.crsp.com/files/data_descriptions_guide_0.pdf for a complete set of variable definitions. CRSP uses PERMCO as a permanent company identifier and PERMNO as a permanent security identifier. Some companies have multiple classes of common stock, which means multiple common stock PERMNOs can be associated with a single PERMCO. Both monthly and daily data are available.
crsp = conn.raw_sql(
"""
select a.permno, a.permco, a.date, a.ret, abs(a.prc)*a.shrout as me, b.exchcd
from crsp.msf a inner join crsp.msenames b
on a.permno=b.permno and a.date between b.namedt and b.nameendt
and b.exchcd in (1,2,3) and b.shrcd in (10,11)
where a.date >= '2000-01-01'
order by a.permno, a.date
""",
date_cols=['date']
)
# change strings or floats to integers
for col in ['permno','permco']:
crsp[col] = crsp[col].astype(int)
# define market equity as the sum of market equities of all permnos associated with a permco
crsp['me'] = crsp.groupby(['date','permco']).me.transform('sum')
Define delisting returns¶
Handling delisting returns is a standard step, though conventions differ. Here, we follow part of the literature and assign a lower (more negative) delisting return to Nasdaq stocks than to NYSE/AMEX stocks when the delisting return is missing and the delisting was due to poor performance.
mse = conn.raw_sql(
"""
select permno, dlret, dlstcd
from crsp.mse
where event='DELIST' and dlstcd>100
order by permno
"""
)
# change string or float to int
mse['permno'] = mse.permno.astype(int)
# merge with crsp, keeping all rows of crsp
crsp = crsp.merge(mse, how='left', on='permno')
del mse
# series of True and False, True if it is the last date for a stock
LastObs = crsp.permno != crsp.permno.shift(-1)
# series of True and False, True if delisted for poor performance
DLCode = (crsp.dlstcd==500) | ((crsp.dlstcd >=520)&(crsp.dlstcd<=584))
# -35% if no delisting return, delisted for poor performance and NYSE/AMEX
crsp['dlret'] = np.where(DLCode & crsp.dlret.isnull() & crsp.exchcd.isin([1,2]), -0.35, crsp.dlret)
# -55% if no delisting return, delisted for poor performance and Nasdaq
crsp['dlret'] = np.where(DLCode & crsp.dlret.isnull() & (crsp.exchcd==3), -0.55, crsp.dlret)
# if delisting return exists and < -1, change to -1
crsp['dlret'] = np.where(crsp.dlret.notnull() & crsp.dlret<-1,-1,crsp.dlret)
# if last day and return exists, define return by compounding with delisting return (if exists)
crsp['ret'] = np.where(LastObs & crsp.ret.notnull(), (1+crsp.ret)*(1+crsp.dlret.fillna(0))-1, crsp.ret)
# if last day and return does not exist, define return as delisting return
crsp['ret'] = np.where(LastObs & crsp.ret.isnull(), crsp.dlret, crsp.ret)
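The delisting rules above, applied to a toy panel so the logic can be checked by hand: one NYSE stock (exchcd=1) and one Nasdaq stock (exchcd=3), both delisted for poor performance (dlstcd 500 or 520-584) with missing delisting returns:

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'permno': [1, 1, 2, 2],
    'ret': [0.02, 0.01, 0.03, np.nan],
    'exchcd': [1, 1, 3, 3],
    'dlret': [np.nan, np.nan, np.nan, np.nan],
    'dlstcd': [500.0, 500.0, 550.0, 550.0],
})

# last observation of each stock, and delisting for poor performance
last = toy.permno != toy.permno.shift(-1)
poor = (toy.dlstcd == 500) | ((toy.dlstcd >= 520) & (toy.dlstcd <= 584))

# fill missing delisting returns: -35% NYSE/AMEX, -55% Nasdaq
toy['dlret'] = np.where(poor & toy.dlret.isnull() & toy.exchcd.isin([1, 2]), -0.35, toy.dlret)
toy['dlret'] = np.where(poor & toy.dlret.isnull() & (toy.exchcd == 3), -0.55, toy.dlret)

# on the last observation, compound with the delisting return if the
# return exists; otherwise use the delisting return itself
toy['ret'] = np.where(last & toy.ret.notnull(), (1 + toy.ret) * (1 + toy.dlret.fillna(0)) - 1, toy.ret)
toy['ret'] = np.where(last & toy.ret.isnull(), toy.dlret, toy.ret)
print(toy[['permno', 'ret', 'dlret']])
```

Stock 1's final return compounds 1% with -35%; stock 2's final return, missing in the data, becomes the -55% delisting return itself. Non-final rows are untouched.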
Merge CRSP with Compustat¶
- Change dates to monthly period format before merging, because the Compustat date is the last calendar day of the month while the CRSP date is the last trading day of the month.
- Merge keeping all rows of CRSP data. There will be NaNs for Compustat data for 11 months each year.
crsp.date = crsp.date.dt.to_period('M')
comp.date = comp.date.dt.to_period('M')
df = crsp.merge(comp, on=['permno', 'date'], how='left')
Fill Compustat data into months¶
- Group by permno when filling forward so we don't fill from one stock into another
- A limit of 11 months on the forward fill is right because we shifted all reports to June 30. Without that shift, the limit should probably be longer, since a firm that changes its fiscal year can go more than 12 months between annual reports.
df[['at', 'inv']] = df.groupby('permno')[['at', 'inv']].ffill(limit=11)
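A toy illustration of a grouped forward fill with a limit (limit=1 here so the cutoff is visible in five rows; the real code uses limit=11 on monthly data):

```python
import pandas as pd

toy = pd.DataFrame({
    'permno': [1, 1, 1, 2, 2],
    'at': [100.0, None, None, 50.0, None],
})

# fill forward within each permno, at most one row past the last value
toy['at_filled'] = toy.groupby('permno')['at'].ffill(limit=1)
print(toy)
```

The second row of stock 1 is filled, the third is left missing because it is past the limit, and stock 2's value never leaks into stock 1 (or vice versa) because the fill is grouped by permno.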
Check result¶
df.head()
|   | permno | permco | date    | ret       | me       | exchcd | dlret    | dlstcd | at  | days | inv |
|---|--------|--------|---------|-----------|----------|--------|----------|--------|-----|------|-----|
| 0 | 10001  | 7953   | 2000-01 | -0.044118 | 19906.25 | 3.0    | 0.011583 | 233.0  | NaN | NaN  | NaN |
| 1 | 10001  | 7953   | 2000-02 | 0.015385  | 20212.50 | 3.0    | 0.011583 | 233.0  | NaN | NaN  | NaN |
| 2 | 10001  | 7953   | 2000-03 | -0.015758 | 19712.00 | 3.0    | 0.011583 | 233.0  | NaN | NaN  | NaN |
| 3 | 10001  | 7953   | 2000-04 | 0.011719  | 19943.00 | 3.0    | 0.011583 | 233.0  | NaN | NaN  | NaN |
| 4 | 10001  | 7953   | 2000-05 | -0.023166 | 19481.00 | 3.0    | 0.011583 | 233.0  | NaN | NaN  | NaN |
df[df.days>720].gvkey.unique()
array([177405., 28877., 30950., 15253., 61697., 120354., 114747., 135425., 25632.])
Coalescing Compustat data¶
Often one fills in a missing Compustat variable with another variable or with some calculation involving other variables. An example is the calculation of preferred stock by Fama and French when they calculate book equity. They use pstkrv if it is not missing; if it is missing, they use pstkl; if both of those are missing, they use pstk; if all three of those are missing they set it to zero.
Here is an implementation using bfill. With axis=1, bfill fills along each row from right to left: a missing value is replaced by the next non-missing value to its right. The result has the same shape as the input, and .iloc[:, 0] then takes the first (zero-th) column, which now holds the first non-missing value among pstkrv, pstkl, and pstk.
comp = conn.raw_sql(
"""
select gvkey, datadate, pstkrv, pstkl, pstk
from comp.funda
where datadate >= '2000-01-01'
and indfmt='INDL' and datafmt='STD' and popsrc='D' and consol='C'
order by gvkey, datadate
""",
date_cols=['datadate']
)
comp['preferred'] = comp[['pstkrv','pstkl','pstk']].bfill(axis=1).iloc[:, 0].fillna(0)
comp[comp.preferred != 0].head()
|    | gvkey  | datadate   | pstkrv | pstkl  | pstk   | preferred |
|----|--------|------------|--------|--------|--------|-----------|
| 23 | 001010 | 2000-12-31 | 11.225 | 9.600  | 9.600  | 11.225    |
| 24 | 001010 | 2001-12-31 | 48.070 | 48.070 | 48.100 | 48.070    |
| 25 | 001010 | 2002-12-31 | 70.209 | 70.209 | 70.200 | 70.209    |
| 65 | 001021 | 2006-06-30 | 4.744  | 4.744  | 4.744  | 4.744     |
| 77 | 001037 | 2000-03-31 | 2.583  | 2.583  | 2.583  | 2.583     |
comp[(comp.pstkrv.isna()) & (comp.preferred != 0)].head()
|      | gvkey  | datadate   | pstkrv | pstkl  | pstk   | preferred |
|------|--------|------------|--------|--------|--------|-----------|
| 1303 | 001429 | 2000-08-31 | NaN    | NaN    | 38.275 | 38.275    |
| 1304 | 001429 | 2001-08-31 | NaN    | NaN    | 38.275 | 38.275    |
| 2857 | 001932 | 2000-12-31 | NaN    | NaN    | 44.865 | 44.865    |
| 2858 | 001932 | 2001-12-31 | NaN    | NaN    | 43.629 | 43.629    |
| 2859 | 001932 | 2002-12-31 | NaN    | 49.895 | 49.895 | 49.895    |