Example 1: scraping a table from a web page¶
- Go to https://en.wikipedia.org/wiki/List_of_S%26P_500_companies to see a table of S&P 500 companies.
- Right-click on the table and select "View Page Source" to see the HTML code for the table.
- The table is in a <table> tag. Use CTRL-F and search for "<table" to find it.
- th = table header, tr = table row, td = table data (see the toy example after this list).
- To find the table on the page in code, we can use BeautifulSoup. Then, we pass the table to pd.read_html().
- We don't need to do the manual inspection we just did. Just run the code.
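Before running it, here is a toy illustration of the th/tr/td anatomy mentioned above. The markup is made up for illustration, not the actual Wikipedia HTML:

from bs4 import BeautifulSoup

# A made-up two-row table illustrating th = header cell,
# tr = row, and td = data cell.
html = """
<table id="constituents">
  <tr><th>Symbol</th><th>Security</th></tr>
  <tr><td>MMM</td><td>3M</td></tr>
  <tr><td>AOS</td><td>A. O. Smith</td></tr>
</table>
"""
soup = BeautifulSoup(html, "html.parser")
print([th.text for th in soup.find_all("th")])  # ['Symbol', 'Security']
print([td.text for td in soup.find_all("td")])  # ['MMM', '3M', 'AOS', 'A. O. Smith']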
In [2]:
import pandas as pd
import requests
from io import StringIO
from bs4 import BeautifulSoup
# URL of the Wikipedia page containing the list of S&P 500 constituents
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
# Fetch the page
response = requests.get(url)
response.raise_for_status() # Ensure the request was successful
# Parse the HTML
soup = BeautifulSoup(response.content, 'html.parser')
# Find the table containing the S&P 500 constituents
table = soup.find('table', {'id': 'constituents'})
# Extract the data into a DataFrame.
# pd.read_html returns a list of DataFrames (a single element here);
# wrapping the HTML string in StringIO avoids the FutureWarning about
# passing literal HTML to read_html.
df = pd.read_html(StringIO(str(table)))[0]
df.head()
Out[2]:
| | Symbol | Security | GICS Sector | GICS Sub-Industry | Headquarters Location | Date added | CIK | Founded |
|---|---|---|---|---|---|---|---|---|
| 0 | MMM | 3M | Industrials | Industrial Conglomerates | Saint Paul, Minnesota | 1957-03-04 | 66740 | 1902 |
| 1 | AOS | A. O. Smith | Industrials | Building Products | Milwaukee, Wisconsin | 2017-07-26 | 91142 | 1916 |
| 2 | ABT | Abbott Laboratories | Health Care | Health Care Equipment | North Chicago, Illinois | 1957-03-04 | 1800 | 1888 |
| 3 | ABBV | AbbVie | Health Care | Biotechnology | North Chicago, Illinois | 2012-12-31 | 1551152 | 2013 (1888) |
| 4 | ACN | Accenture | Information Technology | IT Consulting & Other Services | Dublin, Ireland | 2011-07-06 | 1467373 | 1989 |
Explore this code to see how it works: put each of the following lines in its own code cell and execute it.
response.content
table
type(table)
str(table)
pd.read_html(str(table))
Example 2: getting CSV files from a website¶
- FINRA reports short interest (cumulative short sales that have not been covered) on a semi-monthly basis for all stocks, based on shorts that were not executed on an exchange. This is almost all retail trades.
- You could google FINRA short interest files and click around if necessary to find the data links. You should end up at https://www.finra.org/finra-data/browse-catalog/equity-short-interest/files.
- We want to get all of the files, not just the two shown.
- If you really want FINRA data, you should get an API key at https://developer.finra.org/.
Step 1: Find the URLs¶
- Method A: right-click on the Aug 30 link and select "Copy link address".
- Method B: right-click on the page, choose "View Page Source," then use CTRL-F to search for ".csv" or "Aug 30".
- Method C: use BeautifulSoup to find the links, as sketched after this list.
- We only get two links from this page, but we can guess that all of the files share the same URL structure.
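A minimal sketch of Method C, assuming the links appear in the page's initial HTML. The page builds parts of itself with JavaScript, so a plain GET may return nothing, which is one reason Method B below uses Selenium:

import requests
from bs4 import BeautifulSoup

page_url = "https://www.finra.org/finra-data/browse-catalog/equity-short-interest/files"
response = requests.get(page_url)
soup = BeautifulSoup(response.content, "html.parser")

# Print every link that points to a .csv file (may be empty if the
# links are rendered by JavaScript rather than served in the HTML)
for a in soup.find_all("a", href=True):
    if a["href"].endswith(".csv"):
        print(a["href"])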
Step 2: Read a single file¶
- Download a file and look at it.
- Try reading Aug 30 with pd.read_csv.
- Use pd.read_csv("url", sep="|") because the files are pipe-delimited.
- You'll get an HTTP 403 (Forbidden) error, as shown below. As a workaround, use the requests library to fetch the file and then read the response text with pd.read_csv.
- Ask Julius how to do it. The next cell is Julius's response.
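For reference, the failing direct attempt looks like this. pandas fetches http URLs with urllib under the hood, and FINRA's CDN rejects that request, likely (an assumption) because of urllib's default User-Agent:

# The direct attempt that fails with "HTTP Error 403: Forbidden"
import pandas as pd

url = "https://cdn.finra.org/equity/otcmarket/biweekly/shrt20240830.csv"
df = pd.read_csv(url, sep="|")  # raises urllib.error.HTTPError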
In [3]:
import pandas as pd
import requests
import io

# URL of the CSV file
data_url = 'https://cdn.finra.org/equity/otcmarket/biweekly/shrt20240830.csv'

# Send a GET request to the URL
response = requests.get(data_url)

# Check if the request was successful
if response.status_code == 200:
    # Read the content of the response into a pandas DataFrame
    df = pd.read_csv(io.StringIO(response.text), sep='|')

    # Display the first few rows of the DataFrame
    print(df.head())

    # Display information about the DataFrame
    print("\nDataFrame Info:")
    df.info()
else:
    print(f"Failed to retrieve the data. Status code: {response.status_code}")
    print(f"Response content: {response.text[:500]}...")  # Print first 500 characters of the response
   accountingYearMonthNumber symbolCode                       issueName  \
0                   20240830          A       Agilent Technologies Inc.
1                   20240830         AA               Alcoa Corporation
2                   20240830        AAA  Alternative Access First Prior
3                   20240830      AAALY  Aareal Bank AG Unsponsored Ame
4                   20240830       AAAU  Goldman Sachs Physical Gold ET

  issuerServicesGroupExchangeCode marketClassCode  \
0                               A            NYSE
1                               A            NYSE
2                               E            ARCA
3                               S             OTC
4                               H             BZX

   currentShortPositionQuantity  previousShortPositionQuantity stockSplitFlag  \
0                       3160001                        2855618            NaN
1                      11370782                       13010958            NaN
2                         25705                          25920            NaN
3                             2                              2            NaN
4                        741002                         749049            NaN

   averageDailyVolumeQuantity  daysToCoverQuantity revisionFlag  \
0                     1550973                 2.04          NaN
1                     4758982                 2.39          NaN
2                        6139                 4.19          NaN
3                           0               999.99          NaN
4                     1908691                 1.00          NaN

   changePercent  changePreviousNumber settlementDate
0          10.66                304383     2024-08-30
1         -12.61              -1640176     2024-08-30
2          -0.83                  -215     2024-08-30
3           0.00                     0     2024-08-30
4          -1.07                 -8047     2024-08-30

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19514 entries, 0 to 19513
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   accountingYearMonthNumber        19514 non-null  int64
 1   symbolCode                       19513 non-null  object
 2   issueName                        19514 non-null  object
 3   issuerServicesGroupExchangeCode  19514 non-null  object
 4   marketClassCode                  19514 non-null  object
 5   currentShortPositionQuantity     19514 non-null  int64
 6   previousShortPositionQuantity    19514 non-null  int64
 7   stockSplitFlag                   42 non-null     object
 8   averageDailyVolumeQuantity       19514 non-null  int64
 9   daysToCoverQuantity              19514 non-null  float64
 10  revisionFlag                     1 non-null      object
 11  changePercent                    19514 non-null  float64
 12  changePreviousNumber             19514 non-null  int64
 13  settlementDate                   19514 non-null  object
dtypes: float64(2), int64(5), object(7)
memory usage: 2.1+ MB
Step 3: Download all files¶
- We can run a loop over URLs and use pd.read_csv.
- Method A: Try all possible dates.
- Method B: Find the dates by scraping the page.
- With either method, we can paste the files together as we download them, or we could save them individually to disk and read them all later (a sketch of the save-to-disk variant follows this list).
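A minimal sketch of the save-to-disk variant, using the same date range as the Method A cell below. The shorts/ folder name is hypothetical, chosen just for illustration:

import os
import pandas as pd
import requests

dates = pd.date_range("2024-06-01", "2024-09-23").strftime("%Y%m%d")
os.makedirs("shorts", exist_ok=True)  # hypothetical local folder
for date in dates:
    url = f"https://cdn.finra.org/equity/otcmarket/biweekly/shrt{date}.csv"
    response = requests.get(url)
    if response.status_code == 200:
        # Write the raw bytes to disk; read them later with pd.read_csv
        with open(f"shorts/shrt{date}.csv", "wb") as f:
            f.write(response.content)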
In [7]:
# Method A: small example
start = "2024-06-01"
end = "2024-09-23"
dates = pd.date_range(start, end)
dates = dates.strftime("%Y%m%d")

df = None
good_dates = []
for date in dates:
    url = f"https://cdn.finra.org/equity/otcmarket/biweekly/shrt{date}.csv"
    response = requests.get(url)
    if response.status_code == 200:
        # Read without the header row so the files stack by position
        new_df = pd.read_csv(io.StringIO(response.text), sep='|', skiprows=1, header=None)
        df = pd.concat((df, new_df))  # pd.concat silently drops the initial None
        good_dates.append(date)
        print(date)

# Re-read the last good file with its header row to recover the column names
date = good_dates[-1]
url = f"https://cdn.finra.org/equity/otcmarket/biweekly/shrt{date}.csv"
response = requests.get(url)
new_df = pd.read_csv(io.StringIO(response.text), sep='|')
df.columns = new_df.columns
20240614
20240628
20240715
20240731
20240815
20240830
In [8]:
df.head()
Out[8]:
| | accountingYearMonthNumber | symbolCode | issueName | issuerServicesGroupExchangeCode | marketClassCode | currentShortPositionQuantity | previousShortPositionQuantity | stockSplitFlag | averageDailyVolumeQuantity | daysToCoverQuantity | revisionFlag | changePercent | changePreviousNumber | settlementDate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20240614 | A | Agilent Technologies Inc. | A | NYSE | 2874792 | 4423968 | NaN | 2422474 | 1.19 | NaN | -35.02 | -1549176 | 2024-06-14 |
| 1 | 20240614 | AA | Alcoa Corporation | A | NYSE | 12774970 | 12496353 | NaN | 3824253 | 3.34 | NaN | 2.23 | 278617 | 2024-06-14 |
| 2 | 20240614 | AAA | Alternative Access First Prior | E | ARCA | 21440 | 27895 | NaN | 9855 | 2.18 | NaN | -23.14 | -6455 | 2024-06-14 |
| 3 | 20240614 | AAALY | Aareal Bank AG Unsponsored Ame | S | OTC | 2 | 1 | NaN | 0 | 999.99 | NaN | 100.00 | 1 | 2024-06-14 |
| 4 | 20240614 | AAAU | Goldman Sachs Physical Gold ET | H | BZX | 304291 | 674000 | NaN | 2212406 | 1.00 | NaN | -54.85 | -369709 | 2024-06-14 |
Method B: Find the dates by scraping the page¶
- Use Selenium to work the drop-down menus and get the dates.
- First, we have to find the drop-down menus.
In [2]:
# Find the drop-down menus using selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up the WebDriver (e.g., Chrome)
driver = webdriver.Chrome()

# Navigate to the desired webpage
driver.get('https://www.finra.org/finra-data/browse-catalog/equity-short-interest/files')

# Wait until the page is loaded and the drop-down menus are present
wait = WebDriverWait(driver, 10)
drop_downs = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'select')))

# Print the drop-down menus
for drop_down in drop_downs:
    print(drop_down.get_attribute('outerHTML'))

# Close the WebDriver
driver.quit()
<select data-drupal-selector="edit-custom-month-month" id="edit-custom-month-month" name="custom_month[month]" class="form-select form-control" data-once="chosen" style="display:visible; position:absolute; width:0px; height: 0px; clip:rect(0,0,0,0)" tabindex="-1"><option value="any">- Any -</option><option value="01">January</option><option value="02">February</option><option value="03">March</option><option value="04">April</option><option value="05">May</option><option value="06">June</option><option value="07">July</option><option value="08" selected="selected">August</option><option value="09">September</option><option value="10">October</option><option value="11">November</option><option value="12">December</option></select>
<select data-drupal-selector="edit-custom-year-year" id="edit-custom-year-year" name="custom_year[year]" class="form-select form-control" data-once="chosen" style="display:visible; position:absolute; width:0px; height: 0px; clip:rect(0,0,0,0)" tabindex="-1"><option value="any">- Any -</option><option value="0" selected="selected">2024</option><option value="1">2023</option><option value="2">2022</option><option value="3">2021</option><option value="4">2020</option><option value="5">2019</option><option value="6">2018</option><option value="7">2017</option><option value="8">2016</option></select>
In [ ]:
# Select a month from the months drop-down menu and print the csv file links
from selenium.webdriver.support.ui import Select

# Set up the WebDriver (e.g., Chrome)
driver = webdriver.Chrome()

# Navigate to the desired webpage
driver.get('https://www.finra.org/finra-data/browse-catalog/equity-short-interest/files')

# Wait until the drop-down menu is present
wait = WebDriverWait(driver, 10)
drop_down = wait.until(EC.presence_of_element_located((By.ID, 'edit-custom-month-month')))

# Create a Select object
select = Select(drop_down)

# Select an item by index (index 0 is "- Any -", so index 1 is January)
select.select_by_index(1)

# Wait for the page to update after selecting the drop-down item
# You can wait for a specific element that appears only after the page update
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='view-content']")))

# Locate the links to CSV files on the updated page
csv_links = driver.find_elements(By.XPATH, "//a[contains(@href, '.csv')]")

# Print the links to CSV files
for link in csv_links:
    print(link.get_attribute('href'))

# Close the WebDriver
driver.quit()
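If the cell above does print links on your machine (the page is JavaScript-rendered, so results can vary), the harvested URLs plug straight into the requests pattern from Method A. A minimal sketch, assuming csv_urls holds the hrefs collected before driver.quit():

import io
import pandas as pd
import requests

# csv_urls is assumed to be built inside the Selenium session, e.g.
# csv_urls = [link.get_attribute('href') for link in csv_links]
frames = []
for url in csv_urls:
    response = requests.get(url)
    if response.status_code == 200:
        frames.append(pd.read_csv(io.StringIO(response.text), sep="|"))
df_all = pd.concat(frames, ignore_index=True)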