Day 5¶

BUSI 520: Python for Business Research¶

Kerry Back, JGSB, Rice University¶

Today's plan¶

  • Other ways to run Python (scripts) and using Python on jgsrc1
  • Linear regression

1. Scripts, environments, and jgsrc1¶

Python scripts¶

  • Scripts are text files containing Python code. We can create a script with any text editor.
  • In VS Code, we can select File/New File/Python File.
  • We can also convert a notebook to a script in VS Code (more on this later).
  • Scripts are usually saved with the file extension .py.
  • We can run a script in VS Code (click the run button at the top) or at the command line with python scriptname.py.
  • Print statements print to the console (terminal). We can save files, figures, etc., just as in a notebook.

Why scripts?¶

  • For me, notebooks are for exploring. I jump around in a notebook, execute cells out of order, and add throwaway code just for experimenting or checking things, which I then may forget to delete.
  • If I want to save a record of what I did, to go back to when I next revise a paper, I save the code as a script. Scripts execute in the order they are written, so they are easier to follow and a better record of what was done.
  • Scripts are also very useful for running long jobs in the background on a remote machine (like jgsrc1).

Example¶

  • Select File/New File/Python File in VS Code.
  • Type the following code into the file.
    import pandas as pd

    df = pd.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]}
    )
    df.to_csv("example.csv")
    print(df)
  • Click the "run" button at the top of the VS Code window.

Running at the command line¶

  • Save the file as example.py.
  • Open a terminal window and navigate to the directory where example.py is saved (use cd directoryname to move around).
  • Type python example.py at the command line.
  • If command line stuff is new to you, do yourself a favor and spend a few minutes googling "MS DOS shell commands."

Exporting notebooks to scripts¶

  • After working in a notebook to write code, it is easy to export the notebook as a script.
  • Type the above code into multiple cells in a notebook (just break it up arbitrarily - for illustration).
  • Do CTRL-SHFT-P to open the command palette and find "Jupyter: Export to Python Script."
  • The comments inserted between what were different cells in the notebook do not affect execution, but we can delete them to clean up the script (a sketch of an exported script follows).
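
For illustration, the exported script for the example code above, split across three cells, looks roughly like this (VS Code marks the former cell boundaries with # %% comments):

    # %%
    import pandas as pd

    # %%
    df = pd.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]}
    )

    # %%
    df.to_csv("example.csv")
    print(df)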

Running code on jgsrc1¶

  • An SSH client (e.g., Bitvise) or Remote Desktop
  • jupyterhub.rice.edu in a browser
  • VS Code

We can run either notebooks or scripts with either jupyterhub or VS Code.

Using VS Code on jgsrc1¶

  • Go to the VS Code marketplace and install the Remote - SSH extension from Microsoft.
  • Do CTRL-SHFT-P to open the command palette and type "Remote-SSH: Add New SSH Host" and enter username@jgsrc1.rice.edu.
  • Select File/New Window in VS Code. This isn't necessary but it will allow you to work in two places at the same time.
  • For the remainder, you should be on Rice Owls not Rice Visitor.
  • Do CTRL-SHFT-P to open the command palette in the new window and type "Remote-SSH: Connect to Host" and select jgsrc1.
  • Enter your password. You should be on the server in your home folder. You can now work on the server just as on your own machine (though you have to enter your password again each time you execute "Open Folder").

Virtual environments¶

  • Everyone shares the same environment on jupyterhub. So, you cannot control what packages or what versions of packages you have.
  • For using notebooks in VS Code or for running scripts, you can customize your python environment.
  • In VS Code on jgsrc1, do CTRL-SHFT-P and type "Python: Select Interpreter" and select "Create Virtual Environment"
  • You should end up with a new folder in your home directory called .venv.
    • You could do the same thing in a terminal with python -m venv .venv
  • You need to activate the environment to install packages in it. Open a terminal pane in VS Code on the server. Execute the following.

    source .venv/bin/activate
    pip install pandas matplotlib jupyter
    
  • You need to do the same each time you want to install packages.

  • Do CTRL-SHFT-P and type "Python: Select Interpreter" and select the one in .venv. You're now running code in your new environment.

  • You can also create new environments on a Windows computer.
  • The only change is that you use .venv\Scripts\activate instead of source .venv/bin/activate to activate the environment.

Using tmux on linux¶

  • To run things on jgsrc1 that will keep running if you lose your connection (e.g., you close your laptop), run scripts with tmux.
  • You can do this from a VS Code connection or using any ssh client (e.g., bitvise) or a terminal window opened from jupyterhub.
  • At the command line type tmux new -s mysessionname to start a new session.
  • Activate your python environment within the tmux session (if you want) with source .venv/bin/activate.
  • Start your script running with python myscript.py.
  • Do CTRL-b and then d to detach from the session.
  • Do tmux attach -t mysessionname to reattach to the session.
  • You can have multiple sessions running simultaneously with different names.
  • Google 'tmux cheat sheet' for more details.

2. Regression¶

  • statsmodels implements OLS with HAC standard errors (Newey-West, etc.).
  • statsmodels has two flavors of OLS: formula-based (like R) and array-based.
  • linearmodels is a possible Stata replacement, with fixed effects, clustered standard errors, 2SLS, SUR, etc.
  • Use statsmodels for repeated regressions and linearmodels for more complex models.
  • pystout saves regression output to LaTeX (a Python counterpart to Stata's estout/outreg).
  • scikit-learn implements OLS, LASSO, and Ridge.

Lecture plan¶

  • We'll run the two flavors of statsmodels.
  • We'll look at an example of linearmodels and pystout. You should check the linearmodels docs for more.
  • We'll code Fama-MacBeth. linearmodels includes a Fama-MacBeth function but it is not as flexible as we might like (as far as I can tell).
  • We'll code rolling-window beta estimates.
  • LASSO and Ridge will be next week.
In [83]:
import numpy as np 
import pandas as pd

df = pd.read_stata("WAGE1.DTA")
df.columns
Out[83]:
Index(['wage', 'educ', 'exper', 'tenure', 'nonwhite', 'female', 'married',
       'numdep', 'smsa', 'northcen', 'south', 'west', 'construc', 'ndurman',
       'trcommpu', 'trade', 'services', 'profserv', 'profocc', 'clerocc',
       'servocc', 'lwage', 'expersq', 'tenursq'],
      dtype='object')

Old-fashioned statsmodels¶

  • pass X and y arrays
  • must add a column of 1's to X to get an intercept
  • define the model, fit it, then get results with .summary()
  • the results object has other attributes, including .params, .pvalues, ... (a short example follows the summary output below)
In [84]:
import statsmodels.api as sm 

# right-hand side variables
X = df[
    [
        "educ", 
        "exper", 
        "tenure", 
        "nonwhite", 
        "female", 
        "married"
    ]
]

# include an intercept
X = sm.add_constant(X)

# left-hand side variable
y = df.wage

# define the model
model = sm.OLS(exog=X, endog=y)

# fit the model
result = model.fit()

# look at results
result.summary()
Out[84]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.368
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                     50.41
Date:                Wed, 11 Sep 2024   Prob (F-statistic):           8.27e-49
Time:                        07:52:53   Log-Likelihood:                -1312.3
No. Observations:                 526   AIC:                             2639.
Df Residuals:                     519   BIC:                             2668.
Df Model:                           6
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.6022      0.731     -2.192      0.029      -3.038      -0.166
educ           0.5551      0.050     11.090      0.000       0.457       0.653
exper          0.0187      0.012      1.557      0.120      -0.005       0.042
tenure         0.1388      0.021      6.562      0.000       0.097       0.180
nonwhite      -0.0658      0.427     -0.154      0.877      -0.904       0.772
female        -1.7424      0.267     -6.530      0.000      -2.267      -1.218
married        0.5566      0.287      1.941      0.053      -0.007       1.120
==============================================================================
Omnibus:                      188.311   Durbin-Watson:                   1.795
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              728.036
Skew:                           1.611   Prob(JB):                    8.11e-159
Kurtosis:                       7.779   Cond. No.                         143.
==============================================================================


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
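
As mentioned above, the results object has attributes besides .summary(); a quick sketch:

    result.params       # coefficient estimates as a pandas Series
    result.pvalues      # p-values as a pandas Series
    result.rsquared     # the regression R-squared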
In [86]:
# if you want only the main table 

result.summary().tables[1]
Out[86]:
coef std err t P>|t| [0.025 0.975]
const -1.6022 0.731 -2.192 0.029 -3.038 -0.166
educ 0.5551 0.050 11.090 0.000 0.457 0.653
exper 0.0187 0.012 1.557 0.120 -0.005 0.042
tenure 0.1388 0.021 6.562 0.000 0.097 0.180
nonwhite -0.0658 0.427 -0.154 0.877 -0.904 0.772
female -1.7424 0.267 -6.530 0.000 -2.267 -1.218
married 0.5566 0.287 1.941 0.053 -0.007 1.120
In [102]:
# getting the main table as a dataframe 
# we can ignore the future warning (for now!)

table_as_html = result.summary().tables[1].as_html()
table_as_df = pd.read_html(table_as_html, header=0, index_col=0)[0]
table_as_df.round(3)
C:\Users\kerry\AppData\Local\Temp\ipykernel_22164\2205508342.py:4: FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  table_as_df = pd.read_html(table_as_html, header=0, index_col=0)[0]
Out[102]:
coef std err t P>|t| [0.025 0.975]
const -1.602 0.731 -2.192 0.029 -3.038 -0.166
educ 0.555 0.050 11.090 0.000 0.457 0.653
exper 0.019 0.012 1.557 0.120 -0.005 0.042
tenure 0.139 0.021 6.562 0.000 0.097 0.180
nonwhite -0.066 0.427 -0.154 0.877 -0.904 0.772
female -1.742 0.267 -6.530 0.000 -2.267 -1.218
married 0.557 0.287 1.941 0.053 -0.007 1.120
In [101]:
# save to excel (may need to pip install openpyxl)

table_as_df.to_excel("table.xlsx")

# save to latex

table_as_df.to_latex("table.tex")
In [14]:
# scale the regressors by their standard deviations

for col in ["educ", "exper", "tenure"]:
    X[col] = X[col] / X[col].std()

sm.OLS(exog=X, endog=y).fit().summary().tables[1]
Out[14]:
coef std err t P>|t| [0.025 0.975]
const -1.6022 0.731 -2.192 0.029 -3.038 -0.166
educ 1.5371 0.139 11.090 0.000 1.265 1.809
exper 0.2544 0.163 1.557 0.120 -0.067 0.575
tenure 1.0030 0.153 6.562 0.000 0.703 1.303
nonwhite -0.0658 0.427 -0.154 0.877 -0.904 0.772
female -1.7424 0.267 -6.530 0.000 -2.267 -1.218
married 0.5566 0.287 1.941 0.053 -0.007 1.120
In [ ]:
# more compactly,

X[["educ", "exper", "tenure"]] /= X[["educ", "exper", "tenure"]].std()

R-like statsmodels¶

  • R-like formula; pass the dataframe containing the data to the function
  • Transformations of variables, including interactions, can be included in the formula
  • An intercept is included by default
  • If you don't want an intercept, put -1 in the formula
  • Do model.fit().summary() to get the summary, or .params, etc., as before.
  • Convert to a dataframe and save to Excel or LaTeX as before.
In [24]:
import statsmodels.formula.api as smf

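# note: tenure**2 in a formula reduces to just tenure (see the output below);
# wrap it as I(tenure**2) to include a squared term. The -1 drops the intercept.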
model = smf.ols(
    """ 
    wage ~ educ + np.log(exper) + tenure**2 + nonwhite 
         + female + married + educ*female - 1
    """,
    data=df
)
result = model.fit()
result.summary().tables[1]
Out[24]:
coef std err t P>|t| [0.025 0.975]
educ 0.4237 0.024 17.734 0.000 0.377 0.471
np.log(exper) 0.3510 0.139 2.521 0.012 0.077 0.625
tenure 0.1247 0.021 6.072 0.000 0.084 0.165
nonwhite -0.2982 0.424 -0.703 0.482 -1.131 0.535
female -2.6244 1.015 -2.585 0.010 -4.619 -0.630
married 0.2517 0.303 0.830 0.407 -0.344 0.848
educ:female 0.0503 0.081 0.620 0.536 -0.109 0.210
In [20]:
# the covariance estimator to use when computing std errors is passed 
# as an argument to .fit() in both statsmodels flavors

help(model.fit)
Help on method fit in module statsmodels.regression.linear_model:

fit(method: "Literal['pinv', 'qr']" = 'pinv', cov_type: "Literal['nonrobust', 'fixed scale', 'HC0', 'HC1', 'HC2', 'HC3', 'HAC', 'hac-panel', 'hac-groupsum', 'cluster']" = 'nonrobust', cov_kwds=None, use_t: 'bool | None' = None, **kwargs) method of statsmodels.regression.linear_model.OLS instance
    Full fit of the model.

    The results include an estimate of covariance matrix, (whitened)
    residuals and an estimate of scale.

    Parameters
    ----------
    method : str, optional
        Can be "pinv", "qr".  "pinv" uses the Moore-Penrose pseudoinverse
        to solve the least squares problem. "qr" uses the QR
        factorization.
    cov_type : str, optional
        See `regression.linear_model.RegressionResults` for a description
        of the available covariance estimators.
    cov_kwds : list or None, optional
        See `linear_model.RegressionResults.get_robustcov_results` for a
        description required keywords for alternative covariance
        estimators.
    use_t : bool, optional
        Flag indicating to use the Student's t distribution when computing
        p-values.  Default behavior depends on cov_type. See
        `linear_model.RegressionResults.get_robustcov_results` for
        implementation details.
    **kwargs
        Additional keyword arguments that contain information used when
        constructing a model using the formula interface.

    Returns
    -------
    RegressionResults
        The model estimation results.

    See Also
    --------
    RegressionResults
        The results container.
    RegressionResults.get_robustcov_results
        A method to change the covariance estimator used when fitting the
        model.

    Notes
    -----
    The fit method uses the pseudoinverse of the design/exogenous variables
    to solve the least squares minimization.

  • The available covariance estimators are described in the statsmodels docs (RegressionResults.get_robustcov_results, referenced in the help text above).

  • Newey-West is HAC with a Bartlett kernel.
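
For example, Newey-West standard errors come from passing the covariance type and its options to .fit. A minimal sketch, reusing the formula model from above (the lag length of 4 is just an illustrative choice):

    # refit with Newey-West (HAC) standard errors
    result_nw = model.fit(cov_type="HAC", cov_kwds={"kernel": "bartlett", "maxlags": 4})
    result_nw.summary().tables[1]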

Logit¶

You can also run logit with either old-fashioned or R-like statsmodels.

In [23]:
smf.logit("female ~ wage", df).fit().summary().tables[1]
Optimization terminated successfully.
         Current function value: 0.622458
         Iterations 6
Out[23]:
coef std err z P>|z| [0.025 0.975]
Intercept 1.3893 0.215 6.458 0.000 0.968 1.811
wage -0.2654 0.037 -7.096 0.000 -0.339 -0.192

Linearmodels¶

  • created by Kevin Sheppard
  • docs are very good
  • has R-like formula option
  • define model, fit, and then get summary like before - but .summary instead of .summary()
  • pass covariance matrix type to fit method as before
In [38]:
from linearmodels.panel import PanelOLS
from linearmodels.datasets import wage_panel

data = wage_panel.load()
data = data.set_index(["nr", "year"])
data.head()
Out[38]:
black exper hisp hours married educ union lwage expersq occupation
nr year
13 1980 0 1 0 2672 0 14 0 1.197540 1 9
1981 0 2 0 2320 0 14 1 1.853060 4 9
1982 0 3 0 2940 0 14 0 1.344462 9 9
1983 0 4 0 2960 0 14 0 1.433213 16 9
1984 0 5 0 3071 0 14 0 1.568125 25 5

Indexing for linearmodels data¶

  • The indexing is crucial for fixed effects and clustered standard errors to work.
  • The outside index should be the entity (firm, person, ...)
  • The inside index should be a time index.
In [41]:
model = PanelOLS.from_formula(
    "lwage ~ exper + educ + black + hisp + married + union + TimeEffects",
    data=data,
)
result = model.fit(cov_type="clustered", cluster_time=True, cluster_entity=True)
result.summary
Out[41]:
                          PanelOLS Estimation Summary
================================================================================
Dep. Variable:                  lwage   R-squared:                        0.1216
Estimator:                   PanelOLS   R-squared (Between):              0.9312
No. Observations:                4360   R-squared (Within):               0.1305
Date:                Tue, Sep 10 2024   R-squared (Overall):              0.8962
Time:                        16:23:01   Log-likelihood                   -2986.3
Cov. Estimator:             Clustered
                                        F-statistic:                      100.30
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                  F(6,4346)
Min Obs:                       8.0000
Max Obs:                       8.0000   F-statistic (robust):             36.580
                                        P-value                           0.0000
Time periods:                       8   Distribution:                  F(6,4346)
Avg Obs:                       545.00
Min Obs:                       545.00
Max Obs:                       545.00
================================================================================
Parameter Estimates
Parameter Std. Err. T-stat P-value Lower CI Upper CI
exper 0.0303 0.0133 2.2750 0.0230 0.0042 0.0565
educ 0.0929 0.0100 9.3247 0.0000 0.0733 0.1124
black -0.1373 0.0478 -2.8760 0.0040 -0.2310 -0.0437
hisp 0.0137 0.0350 0.3906 0.6961 -0.0550 0.0823
married 0.1110 0.0219 5.0604 0.0000 0.0680 0.1540
union 0.1863 0.0269 6.9147 0.0000 0.1335 0.2392


F-test for Poolability: 3.1404
P-value: 0.0026
Distribution: F(7,4346)

Included effects: Time

pystout¶

  • works with statsmodels or linearmodels
  • pass fitted models
  • specify the file name to save to and various formatting options (significance stars, etc.) in the pystout function call
In [50]:
# example regressions

model1 = smf.ols("wage ~ female", data=df)
result1 = model1.fit()

model2 = smf.ols("wage ~ female + educ + female*educ", data=df)
result2 = model2.fit()
In [44]:
from pystout import pystout

pystout(
    models=[result1, result2], 
    file="regressions.tex",
    endog_names = ["wage", "wage"],
    exogvars=[
        'female', 
        'educ', 
        'female:educ', 
        ],
    stars={0.1: "*", 0.05: "**", 0.01: "***"},
    addnotes=["$^*p<0.1$, $^{**}p<0.05$, $^{***}p<0.01$"],
    modstat={"nobs": "Obs", "rsquared_adj": "Adj $R^2$"},
    title="Wage Equation",
    label="tab:wage"
)

Fama-MacBeth¶

Run cross-sectional regressions and then use $t$ tests for the means of the time series of cross-sectional coefficients.
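
In symbols: if $\hat{\lambda}_t$ denotes a slope coefficient from the cross-sectional regression at date $t$, for $t = 1, \ldots, T$, then the Fama-MacBeth estimate and test statistic are

$$\hat{\lambda} = \frac{1}{T} \sum_{t=1}^{T} \hat{\lambda}_t\,, \qquad t\text{-stat} = \frac{\hat{\lambda}}{\widehat{\text{se}}\big(\hat{\lambda}\big)}\,,$$

where the standard error is computed from the time series $\hat{\lambda}_1, \ldots, \hat{\lambda}_T$ (below we compute it with a Newey-West adjustment).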

In [77]:
# A small data set with acc=accruals and agr=asset growth, 
# monthly data since 2010, roughly 2,000 stocks per month.

data = pd.read_csv(
    "https://www.dropbox.com/s/012c6y4gxsxss6y/ghz.csv?dl=1", 
    parse_dates=["date"]
)

# change dates to period format (to merge with FF factors)
data["date"] = data.date.dt.to_period("M")

data = data.set_index(["permno", "date"])
data = data.sort_index()
data.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 282598 entries, (np.float64(10025.0), Period('2012-10', 'M')) to (np.float64(93436.0), Period('2021-12', 'M'))
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ret     282598 non-null  float64
 1   acc     282598 non-null  float64
 2   agr     282598 non-null  float64
dtypes: float64(3)
memory usage: 7.7 MB
In [67]:
data.head()
Out[67]:
ret acc agr
permno date
10025.0 2012-10 0.055125 -0.028995 0.184931
2012-11 -0.053652 -0.028995 0.184931
2012-12 -0.020992 -0.028995 0.184931
2013-01 0.086949 -0.028995 0.184931
2013-02 0.095527 -0.028995 0.184931
In [68]:
# cross-sectional regression function

def xreg(df):
    model = smf.ols("ret ~ acc + agr", data=df)
    result = model.fit()
    return result.params

# apply to each cross-section to get time series of coefficients
regressions = data.groupby('date').apply(xreg)
regressions.head()
Out[68]:
Intercept acc agr
date
2010-01 -0.029367 0.021650 -0.011795
2010-02 0.040089 -0.052454 -0.003666
2010-03 0.081821 0.037491 -0.024873
2010-04 0.042478 -0.090209 -0.009522
2010-05 -0.074971 0.008434 0.005427
In [69]:
# We want to run t-tests on the means (the null is that they are zero).
# We probably want to use Newey-West standard errors
# One way to do that is to regress on 1 (the coefficient is the mean) and use
# Newey-West standard errors in statsmodels OLS.

for col in ["acc", "agr"]:
    model = smf.ols(f"{col} ~ 1", data=regressions)
    result = model.fit(cov_type='HAC', cov_kwds={"kernel": "bartlett", "maxlags": 12})
    print(col, result.tvalues.item())
acc 0.43832436517863066
agr -1.5746161411946056

Rolling Window Betas¶

  • We'll run time-series regressions for each stock over rolling windows.
  • It is common to use a 60-month window but to include all stock/months for which at least 24 past months are available within the prior 60. We get that with window=60, min_nobs=24, and expanding=True.
  • RollingOLS raises an error if the specified window size is larger than the number of rows in the data frame.
    • So, in the function below we skip stocks with fewer than 24 rows and set the window size to the smaller of 60 and the number of rows.
In [78]:
# merge Fama-French factors (we want Mkt-RF and RF)
# this pastes the factors for each month beside each stock for that month

from pandas_datareader import DataReader as pdr

ff = pdr("F-F_Research_Data_Factors", "famafrench", start=2010)[0] / 100
data_ff = data.merge(ff, left_on='date', right_index=True, how="left")

# compute excess returns
data_ff["ret_RF"] = data_ff.ret - data_ff.RF

# we can't put Mkt-RF in an ols formula (the hyphen would be read as a minus sign),
# so we change it to an underscore
data_ff = data_ff.rename(columns={"Mkt-RF": "Mkt_RF"})
data_ff.head()
C:\Users\kerry\AppData\Local\Temp\ipykernel_22164\4004107026.py:6: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.
  ff = pdr("F-F_Research_Data_Factors", "famafrench", start=2010)[0] / 100
Out[78]:
ret acc agr Mkt_RF SMB HML RF ret_RF
permno date
10025.0 2012-10 0.055125 -0.028995 0.184931 -0.0176 -0.0115 0.0359 0.0001 0.055025
2012-11 -0.053652 -0.028995 0.184931 0.0078 0.0063 -0.0084 0.0001 -0.053752
2012-12 -0.020992 -0.028995 0.184931 0.0118 0.0148 0.0351 0.0001 -0.021092
2013-01 0.086949 -0.028995 0.184931 0.0557 0.0034 0.0096 0.0000 0.086949
2013-02 0.095527 -0.028995 0.184931 0.0129 -0.0027 0.0011 0.0000 0.095527
In [81]:
from statsmodels.regression.rolling import RollingOLS 

def rolling_betas(df):

    # we'll pass the time series for each stock to this function
    # we first check how many months the stock is in the data
    n = df.shape[0]

    # if there are at least 24 months, we proceed
    if n >= 24:
        model = RollingOLS.from_formula(
            "ret_RF ~ Mkt_RF",
            window=min(n, 60),
            min_nobs=24,
            expanding=True,
            data=df
        )
        result = model.fit()

        # return the time series of rolling window betas
        return result.params['Mkt_RF'].dropna()

    # if number of months < 24, we skip this stock
    else:
        pass
    
betas = data_ff.groupby("permno", group_keys=False).apply(rolling_betas)
betas.dropna().head()
Out[81]:
permno   date   
10026.0  2011-12    0.635232
         2012-01    0.582387
         2012-02    0.557631
         2012-03    0.565231
         2012-04    0.545091
dtype: float64