Introduction to Pandas¶

BUSI 520: Python for Business Research¶

Kerry Back, JGSB, Rice University¶

What is Pandas?¶

Pandas is a powerful and flexible open-source data analysis and manipulation library. It provides two data structures (Series and DataFrame), each of which has many built-in methods. It provides constructors for Series and DataFrames, functions for reading data into DataFrames (from csv, stata, sas, sql, excel, html, and other formats), and functions for operating on Series and DataFrames.

Pandas was created by Wes McKinney, formerly of AQR and the author of the book Python for Data Analysis.

Creating Series and DataFrame using dictionaries¶

In [1]:
import pandas as pd
# Creating a Series using a dictionary
data_series = pd.Series({'a': 1, 'b': 2, 'c': 3})
data_series
Out[1]:
a    1
b    2
c    3
dtype: int64
In [2]:
# Creating a DataFrame using a dictionary
data_frame = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Occupation': ['Engineer', 'Doctor', 'Teacher']
})
data_frame
Out[2]:
      Name  Age Occupation
0    Alice   25   Engineer
1      Bob   30     Doctor
2  Charlie   35    Teacher

Creating a test dataframe with random numbers¶

In [3]:
import numpy as np
np.random.seed(0)
# Generating a 5x3 array of random integers between 0 and 99 (the high endpoint is excluded)
random_data = np.random.randint(0, 100, size=(5, 3))
# Creating a DataFrame from the random data
random_df = pd.DataFrame(random_data, columns=['Column1', 'Column2', 'Column3'])
random_df
Out[3]:
   Column1  Column2  Column3
0       44       47       64
1       67       67        9
2       83       21       36
3       87       70       88
4       88       12       58

Exploring the seaborn tips dataset¶

In [4]:
import seaborn as sns
# Loading the tips dataset
tips = sns.load_dataset('tips')
tips.head()
Out[4]:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4

Extracting Columns and Rows from Series and DataFrames¶

Pandas provides various methods to extract data from Series and DataFrames. In this section, we'll explore three primary ways to achieve this:

  1. Using [] (basic indexing)
  2. Using .loc (label-based indexing)
  3. Using .iloc (integer-location based indexing)
In [5]:
# Using [] (basic indexing)
single_column = random_df['Column1']
multiple_columns = random_df[['Column1', 'Column3']]
series_value = data_series['a']
single_column, multiple_columns, series_value
Out[5]:
(0    44
 1    67
 2    83
 3    87
 4    88
 Name: Column1, dtype: int32,
    Column1  Column3
 0       44       64
 1       67        9
 2       83       36
 3       87       88
 4       88       58,
 1)
In [6]:
# Using .loc (label-based indexing)
row_1_col_1 = random_df.loc[0, 'Column1']
single_row = random_df.loc[1, :]
multiple_rows_cols = random_df.loc[[0, 2], ['Column1', 'Column3']]
series_value_loc = data_series.loc['b']

row_1_col_1, single_row, multiple_rows_cols, series_value_loc
Out[6]:
(44,
 Column1    67
 Column2    67
 Column3     9
 Name: 1, dtype: int32,
    Column1  Column3
 0       44       64
 2       83       36,
 2)
In [7]:
# Using .iloc (integer-location based indexing)
row_2_col_2 = random_df.iloc[1, 1]
single_row_iloc = random_df.iloc[2, :]
multiple_rows_cols_iloc = random_df.iloc[[1, 3], [0, 2]]
series_value_iloc = data_series.iloc[1]

row_2_col_2, single_row_iloc, multiple_rows_cols_iloc, series_value_iloc
Out[7]:
(67,
 Column1    83
 Column2    21
 Column3    36
 Name: 2, dtype: int32,
    Column1  Column3
 1       67        9
 3       87       88,
 2)
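
One difference between .loc and .iloc is worth noting: label slices with .loc include both endpoints, while integer slices with .iloc exclude the stop position, like ordinary Python slices. A minimal sketch using random_df (the variable names here are just illustrative):

In [ ]:
# .loc slices are inclusive of both endpoints: rows labeled 0, 1, and 2
rows_loc = random_df.loc[0:2, 'Column1':'Column2']
# .iloc slices exclude the stop position: rows at positions 0 and 1 only
rows_iloc = random_df.iloc[0:2, 0:2]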

Filtering Rows Based on Logical Conditions¶

In pandas, you can filter rows of a DataFrame based on logical conditions. This is a common operation when you want to focus on a subset of your data that meets specific criteria. The process involves specifying conditions that return a Boolean Series, which can then be used to index the DataFrame and retrieve the desired rows. Let's explore how to perform this operation with examples.

In [8]:
# Filtering rows where Column1 values are greater than 50
condition_1 = random_df['Column1'] > 50
filtered_df_1 = random_df[condition_1]
filtered_df_1
Out[8]:
   Column1  Column2  Column3
1       67       67        9
2       83       21       36
3       87       70       88
4       88       12       58
In [9]:
# Filtering rows where Column2 values are less than 30 or Column3 values are greater than 70
condition_2 = (random_df['Column2'] < 30) | (random_df['Column3'] > 70)
filtered_df_2 = random_df[condition_2]
filtered_df_2
Out[9]:
   Column1  Column2  Column3
2       83       21       36
3       87       70       88
4       88       12       58
In [10]:
# Filtering rows where Column1 values are not equal to 50 (~ negates a Boolean condition)
condition_3 = ~(random_df['Column1'] == 50)
filtered_df_3 = random_df[condition_3]
filtered_df_3
Out[10]:
   Column1  Column2  Column3
0       44       47       64
1       67       67        9
2       83       21       36
3       87       70       88
4       88       12       58
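
Conditions can also be combined with & (and). A sketch using random_df follows; the parentheses are required because & binds more tightly than the comparison operators, and the names condition_4 and filtered_df_4 are just illustrative:

In [ ]:
# Filtering rows where Column1 is greater than 50 and Column3 is less than 60
condition_4 = (random_df['Column1'] > 50) & (random_df['Column3'] < 60)
filtered_df_4 = random_df[condition_4]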

Element-wise Unary and Binary Operations on Columns¶

Pandas allows you to perform element-wise operations on columns, which means applying an operation to each individual element in a column or between columns. These operations can be broadly categorized into unary and binary operations.

  • Unary Operations: These operations are applied to a single column. Examples include taking the square root of each value or negating each value in a column.
  • Binary Operations: These operations are applied between two columns. Examples include adding, subtracting, multiplying, or dividing values between two columns.

Additionally, we'll explore the np.where function, which is a versatile tool for creating new columns based on conditions.
In [11]:
# Unary Operation: Negating the values in Column1
random_df['Negated_Column1'] = -random_df['Column1']
# Binary Operation: Adding the values of Column1 and Column2
random_df['Sum_Column1_Column2'] = random_df['Column1'] + random_df['Column2']
# Using np.where: Creating a new column based on a condition
random_df['Column1_Positive'] = np.where(random_df['Column1'] > 50, 'Yes', 'No')
random_df
Out[11]:
   Column1  Column2  Column3  Negated_Column1  Sum_Column1_Column2 Column1_Positive
0       44       47       64              -44                   91               No
1       67       67        9              -67                  134              Yes
2       83       21       36              -83                  104              Yes
3       87       70       88              -87                  157              Yes
4       88       12       58              -88                  100              Yes
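
The square-root example mentioned above would look like the following sketch, which uses NumPy's element-wise np.sqrt; the column name Sqrt_Column2 is just illustrative:

In [ ]:
# Unary Operation: Taking the square root of each value in Column2
random_df['Sqrt_Column2'] = np.sqrt(random_df['Column2'])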

Handling Missing Data: dropna and fillna¶

In real-world datasets, it's common to encounter missing values. Pandas provides various methods to handle missing data. Two of the most commonly used methods are dropna and fillna.

  • dropna: This method removes missing values. You can specify whether to remove rows or columns that contain missing values.
  • fillna: This method fills in missing values with a specified value or method (e.g., forward fill, backward fill).

Let's create a sample DataFrame with missing values and demonstrate the use of these methods.
In [12]:
import numpy as np
# Creating a sample DataFrame with missing values
data = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, 7, 8],
    'C': [9, 10, 11, np.nan]
}
df_missing = pd.DataFrame(data)
df_missing
Out[12]:
     A    B     C
0  1.0  5.0   9.0
1  2.0  NaN  10.0
2  NaN  7.0  11.0
3  4.0  8.0   NaN
In [13]:
# Using dropna to drop rows with missing values
df_dropped = df_missing.dropna()
# Using fillna to fill missing values with 0
df_filled = df_missing.fillna(0)
df_dropped, df_filled
Out[13]:
(     A    B    C
 0  1.0  5.0  9.0,
      A    B     C
 0  1.0  5.0   9.0
 1  2.0  0.0  10.0
 2  0.0  7.0  11.0
 3  4.0  8.0   0.0)
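
The forward-fill and backward-fill methods mentioned above propagate the nearest non-missing value within each column. A minimal sketch using df_missing (the variable names are illustrative):

In [ ]:
# Forward fill: propagate the last valid value downward within each column
df_ffill = df_missing.ffill()
# Backward fill: propagate the next valid value upward within each column
df_bfill = df_missing.bfill()
# dropna can also drop columns (rather than rows) that contain missing values
df_dropped_cols = df_missing.dropna(axis=1)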

Aggregation on Columns and Rows¶

Aggregation is the process of combining multiple values into a single value. In the context of a DataFrame, this often means computing summary statistics over columns or rows. Pandas provides a variety of methods to perform aggregation, allowing you to quickly compute statistics like the sum, average, maximum, minimum, and more. Let's explore how to perform aggregation on columns and rows using a sample DataFrame.

In [14]:
# Creating a sample DataFrame for demonstration
data = {
    'A': [10, 20, 30, 40],
    'B': [5, 15, 25, 35],
    'C': [1, 2, 3, 4]
}
df_agg = pd.DataFrame(data)
df_agg
Out[14]:
    A   B  C
0  10   5  1
1  20  15  2
2  30  25  3
3  40  35  4
In [15]:
# Column-wise Aggregation
column_sum = df_agg.sum()
column_mean = df_agg.mean()
column_sum, column_mean
Out[15]:
(A    100
 B     80
 C     10
 dtype: int64,
 A    25.0
 B    20.0
 C     2.5
 dtype: float64)
In [16]:
# More column-wise aggregation
column_max = df_agg.max()
column_min = df_agg.min()
column_max, column_min
Out[16]:
(A    40
 B    35
 C     4
 dtype: int64,
 A    10
 B     5
 C     1
 dtype: int64)
In [17]:
# Row-wise Aggregation
row_sum = df_agg.sum(axis=1)
row_sum
Out[17]:
0    16
1    37
2    58
3    79
dtype: int64
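
The other statistics work row-wise as well. A quick sketch of row-wise means and maxima over the same DataFrame:

In [ ]:
# Row-wise mean and max across columns A, B, and C
row_mean = df_agg.mean(axis=1)
row_max = df_agg.max(axis=1)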

Indexing in Pandas¶

The index of a DataFrame is one of the most essential components in pandas. It provides a label for each row, allowing for fast lookups and powerful data alignment capabilities. In this section, we'll explore the following indexing-related functionalities:

  1. set_index: This method allows you to set one or more columns as the index of a DataFrame.
  2. reset_index: This method resets the index of a DataFrame and brings it back to the default integer index.
  3. Multi-indexing: Multi-indexing allows you to have multiple levels of indices in a DataFrame, providing a way to work with higher-dimensional data using a two-dimensional DataFrame structure.
In [18]:
# Creating a sample DataFrame for demonstration
data = {
    'Year': [2020, 2020, 2021, 2021],
    'Month': ['Jan', 'Feb', 'Jan', 'Feb'],
    'Value': [100, 150, 110, 160]
}
df_index = pd.DataFrame(data)
df_index
Out[18]:
   Year Month  Value
0  2020   Jan    100
1  2020   Feb    150
2  2021   Jan    110
3  2021   Feb    160
In [19]:
# Setting the 'Year' column as the index
df_year_index = df_index.set_index('Year')
# Resetting the index
df_reset = df_year_index.reset_index()

df_year_index, df_reset
Out[19]:
(     Month  Value
 Year             
 2020   Jan    100
 2020   Feb    150
 2021   Jan    110
 2021   Feb    160,
    Year Month  Value
 0  2020   Jan    100
 1  2020   Feb    150
 2  2021   Jan    110
 3  2021   Feb    160)
In [20]:
# Setting a multi-index using 'Year' and 'Month'
df_multi_index = df_index.set_index(['Year', 'Month'])
df_multi_index
Out[20]:
            Value
Year Month       
2020 Jan      100
     Feb      150
2021 Jan      110
     Feb      160
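
Once an index is set, .loc lookups use its labels, which is the fast-lookup benefit mentioned above. A sketch using the two indexed DataFrames just created (the variable names are illustrative):

In [ ]:
# Label-based lookup on the single index: all rows for 2020
rows_2020 = df_year_index.loc[2020]
# Label-based lookup on the multi-index: the row for (2021, 'Feb')
value_2021_feb = df_multi_index.loc[(2021, 'Feb')]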

Aggregating by Groups¶

Grouping and aggregating data is a powerful feature in pandas that allows you to segment your data into groups and then apply aggregate functions to each group. This is similar to SQL's GROUP BY clause. The primary method for this in pandas is groupby(). Once data is grouped, you can apply various aggregation functions such as sum(), mean(), max(), and many others to compute statistics for each group. We'll use the tips dataset from seaborn to demonstrate this functionality. This dataset contains information about restaurant tips, with columns indicating the total bill, tip amount, sex of the tipper, day of the week, and other attributes.

In [21]:
import seaborn as sns
# Loading the tips dataset from seaborn
tips = sns.load_dataset('tips')
tips.head()
Out[21]:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
In [22]:
# Grouping by a Single Column: Average total bill by day
avg_bill_by_day = tips.groupby('day')['total_bill'].mean()

avg_bill_by_day
Out[22]:
day
Thur    17.682742
Fri     17.151579
Sat     20.441379
Sun     21.410000
Name: total_bill, dtype: float64
In [23]:
# Grouping by Multiple Columns: Total tip amount by day and time
total_tip_by_day_time = tips.groupby(['day', 'time'])['tip'].sum()

total_tip_by_day_time
Out[23]:
day   time  
Thur  Lunch     168.83
      Dinner      3.00
Fri   Lunch      16.68
      Dinner     35.28
Sat   Lunch       0.00
      Dinner    260.40
Sun   Lunch       0.00
      Dinner    247.39
Name: tip, dtype: float64
In [24]:
# Applying Multiple Aggregation Functions: Mean, sum, and max of total_bill by day
multiple_aggregations = tips.groupby('day')['total_bill'].agg(['mean', 'sum', 'max'])
multiple_aggregations
Out[24]:
           mean      sum    max
day                            
Thur  17.682742  1096.33  43.11
Fri   17.151579   325.88  40.17
Sat   20.441379  1778.40  50.81
Sun   21.410000  1627.16  48.17
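
As an aside, recent versions of pandas (0.25 and later) also support named aggregation, in which the keyword names become the output column names. A sketch (the names avg_bill and total_tip are illustrative):

In [ ]:
# Named aggregation: keyword arguments name the output columns
named_agg = tips.groupby('day').agg(
    avg_bill=('total_bill', 'mean'),
    total_tip=('tip', 'sum')
)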

Stack and Unstack Methods in Pandas¶

When working with multi-index DataFrames or Series in pandas, the stack and unstack methods are useful tools for reshaping the data.

  • stack(): This method "compresses" the inner-most level of the DataFrame's columns into the row index, producing either:
    • A Series (if there's only one column level) or
    • A DataFrame with a new inner-most level of row labels (if there are multiple column levels).
  • unstack(): This method "expands" a level in the DataFrame's rows to produce a DataFrame with a new inner-most level of column labels.

To better understand these methods, let's use the aggregated data from the tips dataset and demonstrate the use of stack and unstack.
In [25]:
# using unstack to convert to 'wide form'

wide1 = total_tip_by_day_time.unstack()
wide1
Out[25]:
time   Lunch  Dinner
day                 
Thur  168.83    3.00
Fri    16.68   35.28
Sat     0.00  260.40
Sun     0.00  247.39
In [26]:
wide2 = total_tip_by_day_time.unstack('day')
wide2
Out[26]:
day       Thur    Fri    Sat     Sun
time                                
Lunch   168.83  16.68    0.0    0.00
Dinner    3.00  35.28  260.4  247.39
In [30]:
# using stack to convert to 'long form'

long = wide1.stack()
long
Out[30]:
day   time  
Thur  Lunch     168.83
      Dinner      3.00
Fri   Lunch      16.68
      Dinner     35.28
Sat   Lunch       0.00
      Dinner    260.40
Sun   Lunch       0.00
      Dinner    247.39
dtype: float64
In [31]:
# extracting a row of a multi-indexed dataframe

long.loc[('Fri', 'Dinner')]
Out[31]:
35.28
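
Partial indexing also works: supplying only the outer level of the multi-index returns all rows under that label. A sketch using the same long Series:

In [ ]:
# Selecting by the outer index level only: the Lunch and Dinner totals for Friday
long.loc['Fri']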

Combining DataFrames: Concatenate, Join, and Merge¶

Pandas provides several methods to combine multiple DataFrames or Series. These methods are essential when working with data from multiple sources or when preparing data for analysis. In this section, we'll explore the following methods:

  1. concat: This function concatenates two or more DataFrames or Series along a particular axis (either rows or columns).
  2. join: This method joins two or more DataFrames based on their indices.
  3. merge: This method is similar to SQL joins. It allows you to merge DataFrames based on common columns.

Let's delve into each of these methods with examples.
In [32]:
# Sample DataFrames for demonstration
# DataFrames for concatenate
df1 = pd.DataFrame({'A': ['A0', 'A1'], 'B': ['B0', 'B1']})
df2 = pd.DataFrame({'A': ['A2', 'A3'], 'B': ['B2', 'B3']})
# DataFrames for join
df3 = pd.DataFrame({'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']}, index=['X', 'Y', 'Z'])
df4 = pd.DataFrame({'E': ['E0', 'E1', 'E2'], 'F': ['F0', 'F1', 'F2']}, index=['X', 'Y', 'Z'])
# DataFrames for merge
df5 = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'G': ['G0', 'G1', 'G2']})
df6 = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'H': ['H0', 'H1', 'H2']})
In [33]:
# concatenating vertically
concatenated_df_v = pd.concat([df1, df2])
df1, df2, concatenated_df_v
Out[33]:
(    A   B
 0  A0  B0
 1  A1  B1,
     A   B
 0  A2  B2
 1  A3  B3,
     A   B
 0  A0  B0
 1  A1  B1
 0  A2  B2
 1  A3  B3)
In [34]:
# concatenating horizontally
concatenated_df_h = pd.concat([df3, df4], axis=1)
df3, df4, concatenated_df_h
Out[34]:
(    C   D
 X  C0  D0
 Y  C1  D1
 Z  C2  D2,
     E   F
 X  E0  F0
 Y  E1  F1
 Z  E2  F2,
     C   D   E   F
 X  C0  D0  E0  F0
 Y  C1  D1  E1  F1
 Z  C2  D2  E2  F2)
In [35]:
# Using join to join df3 and df4 based on their indices
joined_df = df3.join(df4)
joined_df
Out[35]:
    C   D   E   F
X  C0  D0  E0  F0
Y  C1  D1  E1  F1
Z  C2  D2  E2  F2
In [36]:
# Using merge to merge df5 and df6 based on the 'key' column
merged_df = pd.merge(df5, df6, on='key')
merged_df
Out[36]:
  key   G   H
0  K0  G0  H0
1  K1  G1  H1
2  K2  G2  H2

Inner and Outer Joins/Merges¶

When merging or joining DataFrames in pandas, you can specify how you want to handle the combination of rows based on the keys. The two most common types of joins/merges are:

  1. Inner Join/Merge: This type returns only the rows with matching keys in both DataFrames.
  2. Outer Join/Merge: This type returns all rows from both DataFrames, filling in NaN for missing values in columns.

Let's delve deeper into each of these types with examples.
In [37]:
# Sample DataFrames for demonstration
df_A = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'value_A': ['A0', 'A1', 'A2']})
df_B = pd.DataFrame({'key': ['K1', 'K2', 'K3'], 'value_B': ['B1', 'B2', 'B3']})
df_A, df_B
Out[37]:
(  key value_A
 0  K0      A0
 1  K1      A1
 2  K2      A2,
   key value_B
 0  K1      B1
 1  K2      B2
 2  K3      B3)
In [38]:
# Performing an inner merge on df_A and df_B based on the 'key' column
inner_merged = pd.merge(df_A, df_B, on='key', how='inner')
inner_merged
Out[38]:
  key value_A value_B
0  K1      A1      B1
1  K2      A2      B2
In [39]:
# Performing an outer merge on df_A and df_B based on the 'key' column
outer_merged = pd.merge(df_A, df_B, on='key', how='outer')
outer_merged
Out[39]:
  key value_A value_B
0  K0      A0     NaN
1  K1      A1      B1
2  K2      A2      B2
3  K3     NaN      B3

Map, Apply, and Transform Methods in Pandas¶

Pandas provides several methods to apply functions to Series and DataFrames, allowing for powerful and flexible data manipulation. In this section, we'll explore the following methods:

  1. map: Used with a Series to substitute each value with another value, specified by a function, dict, or Series.
  2. apply: Used with both Series and DataFrames to apply a function along the input axis.
  3. transform: Used with groupby objects to produce a like-indexed DataFrame that is broadcastable back to the original object.

Let's delve into each of these methods with explanations and examples.
In [40]:
# Sample Series for demonstration
names = pd.Series(['Alice', 'Bob', 'Charlie'])
# Using map to prefix each name with 'Hello, '
greetings = names.map(lambda x: 'Hello, ' + x)
greetings
Out[40]:
0      Hello, Alice
1        Hello, Bob
2    Hello, Charlie
dtype: object
In [41]:
# Sample Series and DataFrame for demonstration
numbers = pd.Series([1, 2, 3, 4])
df_numbers = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# Using apply with a Series to square each number
squared_numbers = numbers.apply(lambda x: x**2)
# Using apply with a DataFrame to sum values along the columns
column_sums = df_numbers.apply(sum, axis=0)
squared_numbers, column_sums
Out[41]:
(0     1
 1     4
 2     9
 3    16
 dtype: int64,
 A     6
 B    15
 dtype: int64)
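
apply can also work row-wise on a DataFrame by passing axis=1, in which case each row is passed to the function as a Series. A sketch using df_numbers (the name row_products is illustrative):

In [ ]:
# Row-wise apply: multiply each row's A and B values
row_products = df_numbers.apply(lambda row: row['A'] * row['B'], axis=1)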
In [42]:
# Using the tips dataset for demonstration
import seaborn as sns
tips = sns.load_dataset('tips')
# Grouping by 'day' and using transform to compute mean tip for each day
mean_tips_by_day = tips.groupby('day')['tip'].transform('mean')
tips['mean_tip_by_day'] = mean_tips_by_day
tips.head()
Out[42]:
   total_bill   tip     sex smoker  day    time  size  mean_tip_by_day
0       16.99  1.01  Female     No  Sun  Dinner     2         3.255132
1       10.34  1.66    Male     No  Sun  Dinner     3         3.255132
2       21.01  3.50    Male     No  Sun  Dinner     3         3.255132
3       23.68  3.31    Male     No  Sun  Dinner     2         3.255132
4       24.59  3.61  Female     No  Sun  Dinner     4         3.255132
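
Because transform returns a like-indexed Series, it can be combined directly with the original columns. For example, here is a sketch that demeans each tip by its day's average (the column name tip_demeaned is illustrative):

In [ ]:
# Demeaning: subtract each day's average tip from the individual tips
tips['tip_demeaned'] = tips['tip'] - tips.groupby('day')['tip'].transform('mean')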

Exploratory Methods in Pandas: describe, info, head, and tail¶

When working with DataFrames in pandas, it's often useful to quickly explore and understand the structure and content of the data. Pandas provides several methods to help with this initial data exploration. In this section, we'll explore the following methods:

  1. describe: Provides a summary of the central tendencies, dispersion, and shape of the distribution of a dataset.
  2. info: Gives a concise summary of a DataFrame, including the number of non-null values in each column, data type, and memory usage.
  3. head: Returns the first few rows of a DataFrame or Series.
  4. tail: Returns the last few rows of a DataFrame or Series.

Let's delve into each of these methods with explanations and examples using the tips dataset.
In [43]:
# Using describe method on the tips dataset
tips.describe()
Out[43]:
       total_bill         tip        size  mean_tip_by_day
count  244.000000  244.000000  244.000000       244.000000
mean    19.785943    2.998279    2.569672         2.998279
std      8.902412    1.383638    0.951100         0.197993
min      3.070000    1.000000    1.000000         2.734737
25%     13.347500    2.000000    2.000000         2.771452
50%     17.795000    2.900000    2.000000         2.993103
75%     24.127500    3.562500    3.000000         3.255132
max     50.810000   10.000000    6.000000         3.255132
In [44]:
# Using info method on the tips dataset
tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   total_bill       244 non-null    float64 
 1   tip              244 non-null    float64 
 2   sex              244 non-null    category
 3   smoker           244 non-null    category
 4   day              244 non-null    category
 5   time             244 non-null    category
 6   size             244 non-null    int64   
 7   mean_tip_by_day  244 non-null    float64 
dtypes: category(4), float64(3), int64(1)
memory usage: 9.3 KB
In [45]:
# Using head method on the tips dataset to view the first 3 rows
tips.head(3)
Out[45]:
   total_bill   tip     sex smoker  day    time  size  mean_tip_by_day
0       16.99  1.01  Female     No  Sun  Dinner     2         3.255132
1       10.34  1.66    Male     No  Sun  Dinner     3         3.255132
2       21.01  3.50    Male     No  Sun  Dinner     3         3.255132
In [46]:
# Using tail method on the tips dataset to view the last 3 rows
tips.tail(3)
Out[46]:
     total_bill   tip     sex smoker   day    time  size  mean_tip_by_day
241       22.67  2.00    Male    Yes   Sat  Dinner     2         2.993103
242       17.82  1.75    Male     No   Sat  Dinner     2         2.993103
243       18.78  3.00  Female     No  Thur  Dinner     2         2.771452

Reading and Writing DataFrames¶

Pandas provides functions for reading files in different formats into DataFrames. These include read_csv, read_stata, read_sas, read_sql, and read_html.

There are corresponding write methods: to_csv, to_stata, ...

Stata files can include variable descriptions in addition to the data. If you execute read_stata, you will only get the data. If you also want the variable descriptions, then use the pandas StataReader.

In [47]:
from pandas.io.stata import StataReader

stata_file = StataReader('WAGE1.dta')
variables = stata_file.variable_labels()
data = stata_file.read()

variables
Out[47]:
{'wage': 'average hourly earnings',
 'educ': 'years of education',
 'exper': 'years potential experience',
 'tenure': 'years with current employer',
 'nonwhite': '=1 if nonwhite',
 'female': '=1 if female',
 'married': '=1 if married',
 'numdep': 'number of dependents',
 'smsa': '=1 if live in SMSA',
 'northcen': '=1 if live in north central U.S',
 'south': '=1 if live in southern region',
 'west': '=1 if live in western region',
 'construc': '=1 if work in construc. indus.',
 'ndurman': '=1 if in nondur. manuf. indus.',
 'trcommpu': '=1 if in trans, commun, pub ut',
 'trade': '=1 if in wholesale or retail',
 'services': '=1 if in services indus.',
 'profserv': '=1 if in prof. serv. indus.',
 'profocc': '=1 if in profess. occupation',
 'clerocc': '=1 if in clerical occupation',
 'servocc': '=1 if in service occupation',
 'lwage': 'log(wage)',
 'expersq': 'exper^2',
 'tenursq': 'tenure^2'}
In [48]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 526 entries, 0 to 525
Data columns (total 24 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   wage      526 non-null    float32
 1   educ      526 non-null    int8   
 2   exper     526 non-null    int8   
 3   tenure    526 non-null    int8   
 4   nonwhite  526 non-null    int8   
 5   female    526 non-null    int8   
 6   married   526 non-null    int8   
 7   numdep    526 non-null    int8   
 8   smsa      526 non-null    int8   
 9   northcen  526 non-null    int8   
 10  south     526 non-null    int8   
 11  west      526 non-null    int8   
 12  construc  526 non-null    int8   
 13  ndurman   526 non-null    int8   
 14  trcommpu  526 non-null    int8   
 15  trade     526 non-null    int8   
 16  services  526 non-null    int8   
 17  profserv  526 non-null    int8   
 18  profocc   526 non-null    int8   
 19  clerocc   526 non-null    int8   
 20  servocc   526 non-null    int8   
 21  lwage     526 non-null    float32
 22  expersq   526 non-null    int16  
 23  tenursq   526 non-null    int16  
dtypes: float32(2), int16(2), int8(20)
memory usage: 20.5 KB
In [49]:
# write to a csv file (by default, the row index is written as the first column)

data.to_csv("wage1.csv")
In [50]:
# read a csv file

data2 = pd.read_csv("wage1.csv")
data2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  526 non-null    int64  
 1   wage        526 non-null    float64
 2   educ        526 non-null    int64  
 3   exper       526 non-null    int64  
 4   tenure      526 non-null    int64  
 5   nonwhite    526 non-null    int64  
 6   female      526 non-null    int64  
 7   married     526 non-null    int64  
 8   numdep      526 non-null    int64  
 9   smsa        526 non-null    int64  
 10  northcen    526 non-null    int64  
 11  south       526 non-null    int64  
 12  west        526 non-null    int64  
 13  construc    526 non-null    int64  
 14  ndurman     526 non-null    int64  
 15  trcommpu    526 non-null    int64  
 16  trade       526 non-null    int64  
 17  services    526 non-null    int64  
 18  profserv    526 non-null    int64  
 19  profocc     526 non-null    int64  
 20  clerocc     526 non-null    int64  
 21  servocc     526 non-null    int64  
 22  lwage       526 non-null    float64
 23  expersq     526 non-null    int64  
 24  tenursq     526 non-null    int64  
dtypes: float64(2), int64(23)
memory usage: 102.9 KB
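
The Unnamed: 0 column above is the row index that to_csv wrote to the file by default. A sketch of two common ways to avoid it:

In [ ]:
# Option 1: don't write the index to the file
data.to_csv("wage1.csv", index=False)
# Option 2: when reading, treat the first column as the index
data2 = pd.read_csv("wage1.csv", index_col=0)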