import numpy as np
import pandas as pd
import pandas_datareader as pdr
from scipy.stats import ttest_1samp, ttest_ind
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels.panel import PanelOLS, FamaMacBeth
from linearmodels.iv import IV2SLS
from linearmodels.system import SUR
from statsmodels.regression.rolling import RollingOLS
from statsmodels.tsa.api import VAR
from pystout import pystout
# The returned objects expose the t-statistic as .statistic and the
# p-value as .pvalue.
sampleA = np.random.normal(size=100)
sampleB = np.random.normal(loc=0.1, scale=2, size=150)
# one-sample test of H0: mean = 0
result1 = ttest_1samp(sampleB, popmean=0)
# two-sample test of H0: meanA = meanB, assuming equal variances
result2 = ttest_ind(sampleA, sampleB)
# Welch test of H0: meanA = meanB, allowing unequal variances
result3 = ttest_ind(sampleA, sampleB, equal_var=False)
data = pd.read_csv("WAGE1_revised.csv")
# Inspect any of the results below with result.summary().
# Bivariate OLS via the array interface; the constant is added by hand.
model1 = sm.OLS(data.wage, sm.add_constant(data.exper))
result1 = model1.fit()
# The formula interface adds an intercept automatically.
model2 = smf.ols("wage ~ female", data=data)
result2 = model2.fit()
# multiple regressors
model3 = smf.ols("wage ~ female + educ", data=data)
result3 = model3.fit()
# transformed regressors: numpy functions work inside formulas
model4 = smf.ols("wage ~ female + educ + np.log(exper)", data=data)
result4 = model4.fit()
# an interaction term
model5 = smf.ols("wage ~ female + educ + female*educ + np.log(exper)", data=data)
result5 = model5.fit()
# categorical (dummy) regressors via C()
model6 = smf.ols("wage ~ female + educ + female*educ + np.log(exper) + C(area)", data=data)
result6 = model6.fit()
# appending "- 1" to a formula suppresses the intercept
model7 = smf.ols("wage ~ female + educ + female*educ + np.log(exper) + C(area) - 1", data=data)
result7 = model7.fit()
The linearmodels package was created by Kevin Sheppard. Many of the following examples come from the linearmodels user guide https://bashtage.github.io/linearmodels/.
# In linearmodels, result.summary is a property (no parentheses).
from linearmodels.datasets import wage

# --- Instrumental variables: educ instrumented by sibs ---
# The bracketed term [educ ~ sibs] marks the endogenous regressor
# and its instrument.
data = wage.load()
data = data.dropna(subset=["educ", "wage", "sibs", "exper"])
model1 = IV2SLS.from_formula("np.log(wage) ~ exper + [educ ~ sibs]", data=data)
result1 = model1.fit(cov_type="robust")

# --- Seemingly unrelated regressions: one equation per {...} line ---
from linearmodels.datasets import fringe

data = fringe.load()
formula = """
{hrbens ~ educ + exper + union + south + nrtheast + nrthcen + male}
{hrearn ~ educ + exper + nrtheast + married + male}
"""
model2 = SUR.from_formula(formula, data=data)
result2 = model2.fit(cov_type="robust")
To use the formula version of linearmodels with fixed effects, create a multi-index for the dataframe with the outside (first) index being "entity" and the inside (second) index being "time." For other fixed effects, use the basic (non-formula) version of linearmodels.
from linearmodels.datasets import wage_panel

# Multi-index required by the formula interface: outer level = entity (nr),
# inner level = time (year).
data = wage_panel.load().set_index(["nr", "year"])

# entity fixed effects, standard errors clustered by entity
model3 = PanelOLS.from_formula("lwage ~ exper + EntityEffects", data=data)
result3 = model3.fit(cov_type="clustered", cluster_entity=True)

# time fixed effects, standard errors clustered by time
model4 = PanelOLS.from_formula("lwage ~ exper + married + black + TimeEffects", data=data)
result4 = model4.fit(cov_type="clustered", cluster_time=True)

# two-way fixed effects with two-way clustering
model5 = PanelOLS.from_formula("lwage ~ exper + EntityEffects + TimeEffects", data=data)
result5 = model5.fit(cov_type="clustered", cluster_entity=True, cluster_time=True)
Run cross-sectional regressions and then use $t$ tests for the means of the time series of cross-sectional coefficients.
# A small data set with acc=accruals and agr=asset growth,
# monthly data since 2010, roughly 2,000 stocks per month.
data = pd.read_csv("https://www.dropbox.com/s/012c6y4gxsxss6y/ghz.csv?dl=1", parse_dates=["date"])
data = data.sort_values(by=['permno', 'date'])
data = data.set_index(["permno", "date"])
# winsorize and standardize cross sections
# Use bracket assignment rather than attribute assignment (data.agr = ...):
# attribute-style setting is discouraged by pandas and silently creates a
# plain attribute instead of a column if the name does not already exist.
data["agr"] = np.log(1 + data["agr"])
def winsorize(ser):
    """Clip a Series to its 1st and 99th percentiles."""
    lo, hi = ser.quantile(0.01), ser.quantile(0.99)
    return ser.clip(lower=lo, upper=hi)
# Winsorize and standardize each characteristic within each date's
# cross-section.  Use groupby.transform rather than groupby.apply:
# transform always returns a result aligned to the original index, which
# is exactly what column assignment needs.  apply triggers the pandas
# FutureWarning about group keys shown below, and under the future
# behavior would prepend the date to the index and misalign the
# assignment.
for char in ["acc", "agr"]:
    data[char] = data.groupby("date")[char].transform(winsorize)
    data[char] = data.groupby("date")[char].transform(lambda x: x / x.std())
(Output: each `groupby(...).apply(...)` call above emits a pandas FutureWarning — "Not prepending group keys to the result index of transform-like apply" — advising `groupby(..., group_keys=False)` to keep the current behavior or `group_keys=True` to adopt the future behavior. Using `groupby(...).transform` instead of `apply` avoids the warning entirely.)
# --- Fama-MacBeth regressions ---
# Packaged version: Newey-West (Bartlett kernel) covariance, 12-lag bandwidth.
model6 = FamaMacBeth.from_formula("ret ~ acc + agr + 1", data=data)
result6 = model6.fit(cov_type="kernel", kernel="Bartlett", bandwidth=12)

# "By hand": run one cross-sectional OLS per date, then t-test the time
# series of estimated coefficients.
def xreg(df):
    """Return OLS coefficients of ret on acc and agr for one cross-section."""
    return smf.ols("ret ~ acc + agr", data=df).fit().params

# One row of coefficients per date.
fm = data.groupby('date').apply(xreg)

# Mean-coefficient t-tests via intercept-only OLS with Newey-West
# (HAC/Bartlett) standard errors.
model7a = smf.ols("acc ~ 1", data=fm)
result7a = model7a.fit(cov_type='HAC', cov_kwds={"kernel": "bartlett", "maxlags": 12})
model7b = smf.ols("agr ~ 1", data=fm)
result7b = model7b.fit(cov_type='HAC', cov_kwds={"kernel": "bartlett", "maxlags": 12})
For Fama-MacBeth, we run cross-sectional regressions at each date. For this exercise, we will run time-series regressions for each entity (stock). We'll run the time-series regressions over rolling windows.
The time series regressions are for stock returns on Fama-French factors. It is common to use 60 months as the window but to include all stock/months for which 24 past months were available in the prior 60 months. We do that with window=60, min_nobs=24, and expanding=True.
The RollingOLS function crashes if you specify a window size that is larger than the number of rows in the data frame. So, we construct a function to "pass" if the number of rows is less than 24 and specify the window size as the smaller of 60 and the number of rows.
# Monthly Fama-French factors since 2000, converted from percent to decimal.
ff = pdr.DataReader("F-F_Research_Data_Factors", "famafrench", start=2000)[0] / 100
# Merge onto the stock data and rename Mkt-RF so it is a valid formula name
# (hyphens are not allowed in patsy formulas).
data = (
    data.merge(ff, left_on='date', right_index=True)
    .rename(columns={"Mkt-RF": "Mkt_RF"})
)
# excess return
data["ret_RF"] = data["ret"] - data["RF"]
data = data.dropna(subset=["ret_RF", "Mkt_RF", "SMB", "HML"])
def rolling_betas(df):
    """Rolling Fama-French betas for one stock.

    Regresses excess returns on Mkt_RF, SMB, and HML with RollingOLS over
    a window of min(n, 60) months, expanding from a 24-month minimum.
    Returns a DataFrame of slope estimates indexed by date, or None when
    the stock has fewer than 24 observations (groupby.apply drops None
    results, so such stocks are simply skipped).
    """
    n = df.shape[0]
    # Guard clause instead of the original `else: pass` (implicit None):
    # RollingOLS crashes when the window exceeds the number of rows.
    if n < 24:
        return None
    # Renamed from `data` to avoid shadowing the module-level DataFrame.
    ts = df.set_index("date")
    model = RollingOLS.from_formula(
        "ret_RF ~ Mkt_RF + SMB + HML",
        window=min(n, 60),
        min_nobs=24,
        expanding=True,
        data=ts,
    )
    result = model.fit()
    # Keep only the slopes; drop the leading rows with fewer than 24 obs.
    return result.params[['Mkt_RF', 'SMB', 'HML']].dropna()

# NOTE(review): this apply emits the pandas FutureWarning about group keys
# (see output below); future pandas may prepend permno to the result index.
# Consider groupby("permno", group_keys=...) to pin the intended layout.
betas = data.reset_index().groupby("permno").apply(rolling_betas)
C:\Users\kerry\AppData\Local\Temp\ipykernel_20000\998454947.py:17: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True) betas = data.reset_index().groupby("permno").apply(rolling_betas)
# Vector autoregression of the monthly factor series; fit() is called with
# default lag settings here — inspect with result.summary().
model = VAR(ff)
result = model.fit()
data = pd.read_csv("WAGE1_revised.csv")
# simple (bivariate) regression
model1 = smf.ols("wage ~ female", data=data)
result1 = model1.fit()
# multivariate, with an interaction
model2 = smf.ols("wage ~ female + educ + female*educ", data=data)
result2 = model2.fit()
# saving to Excel
# summary2() exposes the coefficient table as a real DataFrame (numeric
# dtypes, named columns), unlike summary().tables[1], which is a
# SimpleTable of formatted-string cells.
# NOTE: to_excel raises PermissionError if excelfile.xlsx is already open
# in Excel — close the workbook before rerunning (see traceback below).
result2.summary2().tables[1].to_excel("excelfile.xlsx")
--------------------------------------------------------------------------- PermissionError Traceback (most recent call last) c:\Users\kerry\repos\busi520-development\docs\11-Statistics.ipynb Cell 29 line 1 <a href='vscode-notebook-cell:/c%3A/Users/kerry/repos/busi520-development/docs/11-Statistics.ipynb#X54sZmlsZQ%3D%3D?line=8'>9</a> result2 = model2.fit() <a href='vscode-notebook-cell:/c%3A/Users/kerry/repos/busi520-development/docs/11-Statistics.ipynb#X54sZmlsZQ%3D%3D?line=10'>11</a> # saving to Excel ---> <a href='vscode-notebook-cell:/c%3A/Users/kerry/repos/busi520-development/docs/11-Statistics.ipynb#X54sZmlsZQ%3D%3D?line=11'>12</a> pd.DataFrame(result2.summary().tables[1]).to_excel("excelfile.xlsx") File c:\Users\kerry\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File c:\Users\kerry\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File c:\Users\kerry\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py:2373, in NDFrame.to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep, verbose, freeze_panes, storage_options) 2360 from pandas.io.formats.excel import ExcelFormatter 2362 formatter = ExcelFormatter( 2363 df, 2364 na_rep=na_rep, (...) 
2371 inf_rep=inf_rep, 2372 ) -> 2373 formatter.write( 2374 excel_writer, 2375 sheet_name=sheet_name, 2376 startrow=startrow, 2377 startcol=startcol, 2378 freeze_panes=freeze_panes, 2379 engine=engine, 2380 storage_options=storage_options, 2381 ) File c:\Users\kerry\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\formats\excel.py:911, in ExcelFormatter.write(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options) 907 need_save = False 908 else: 909 # error: Cannot instantiate abstract class 'ExcelWriter' with abstract 910 # attributes 'engine', 'save', 'supported_extensions' and 'write_cells' --> 911 writer = ExcelWriter( # type: ignore[abstract] 912 writer, engine=engine, storage_options=storage_options 913 ) 914 need_save = True 916 try: File c:\Users\kerry\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel\_openpyxl.py:60, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs) 56 from openpyxl.workbook import Workbook 58 engine_kwargs = combine_kwargs(engine_kwargs, kwargs) ---> 60 super().__init__( 61 path, 62 mode=mode, 63 storage_options=storage_options, 64 if_sheet_exists=if_sheet_exists, 65 engine_kwargs=engine_kwargs, 66 ) 68 # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from 69 # the file and later write to it 70 if "r+" in self._mode: # Load from existing workbook File c:\Users\kerry\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel\_base.py:1302, in ExcelWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs) 1298 self._handles = IOHandles( 1299 cast(IO[bytes], path), compression={"compression": None} 1300 ) 1301 if not isinstance(path, ExcelWriter): -> 1302 self._handles = get_handle( 1303 path, mode, storage_options=storage_options, is_text=False 1304 ) 1305 
self._cur_sheet = None 1307 if date_format is None: File c:\Users\kerry\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\common.py:866, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 857 handle = open( 858 handle, 859 ioargs.mode, (...) 862 newline="", 863 ) 864 else: 865 # Binary mode --> 866 handle = open(handle, ioargs.mode) 867 handles.append(handle) 869 # Convert BytesIO or file objects passed with an encoding PermissionError: [Errno 13] Permission denied: 'excelfile.xlsx'
# Export both regressions to a LaTeX table with pystout.
pystout(
    models=[result1, result2],
    file="texfile.tex",
    endog_names=["wage", "wage"],
    exogvars=['female', 'educ', 'female:educ'],
    stars={0.1: "*", 0.05: "**", 0.01: "***"},
    addnotes=["$^*p<0.1$, $^{**}p<0.05$, $^{***}p<0.01$"],
    modstat={"nobs": "Obs", "rsquared_adj": "Adj $R^2$"},
    title="Wage Equation",
    label="tab:wage",
)
c:\Users\kerry\AppData\Local\Programs\Python\Python310\lib\site-packages\pystout\pystout.py:377: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. options = options.append(pd.DataFrame([r],index=[value])) c:\Users\kerry\AppData\Local\Programs\Python\Python310\lib\site-packages\pystout\pystout.py:377: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. options = options.append(pd.DataFrame([r],index=[value]))