我需要从SEC EDGAR数据库中下载约3500家公司的所有8-K文件,有人知道可以用什么软件或代码来实现吗?
我尝试了sec-edgar-downloader (https://pypi.org/project/sec-edgar-downloader),这个软件很好用,但它一次只能下载一家公司的8-K文件。
我还找到了下面这段代码,但我不会编程,所以不太理解它。这段代码能满足我的需求吗?应该如何使用?
提前谢谢你。
import pandas as pd
import gc
import glob
import datetime
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import os, csv, time
from bs4 import BeautifulSoup as bs
import re
import sys
# import edgar  # you only need this and the next line the first time you download the index
# edgar.download_index(path_sec, 2000)  # ... where '2000' is the first year of the period from which you want the data
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 503, 504),
    session=None,
):
    """Return a ``requests`` session that retries failed requests.

    This function provides a connection object that is more efficient than
    issuing bare requests: a retry-enabled adapter is mounted for both
    ``http://`` and ``https://`` URLs.

    Args:
        retries: Used as the total, read and connect retry budget.
        backoff_factor: Passed to ``Retry`` to space out successive attempts.
        status_forcelist: HTTP status codes that trigger a retry.
        session: Existing session to configure; a new ``requests.Session``
            is created when omitted.

    Returns:
        The configured session (the one passed in, if any).
    """
    if session is None:
        session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    # Mount on both schemes so every request made through the session retries.
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
def creates_df(tsv_folder, file_type, st_year=2009, lst_year=datetime.datetime.today().year):
    """Collect the index rows of one SEC form type into ``sec_dataframe.csv``.

    Every ``*.tsv`` index file in *tsv_folder* whose year falls between
    *st_year* and *lst_year* (inclusive) is read, filtered to *file_type* and
    appended to ``sec_dataframe.csv`` in the current working directory.
    The year is taken from the four characters that start 13 characters from
    the end of the file path (e.g. ``2009`` in ``2009-QTR1.tsv``).

    Args:
        tsv_folder: Full path of the folder holding the TSV index files
            (with or without a trailing separator).
        file_type: SEC form type to keep, e.g. ``'8-K'`` or ``'DEFM14A'``.
        st_year: First year to include, YYYY. Default is 2009.
        lst_year: Last year to include, YYYY. Defaults to the current year
            (evaluated when the function is defined).

    Returns:
        None. Files that cannot be read are reported on stdout and skipped.
    """
    first_year = int(st_year)
    last_year = int(lst_year)
    destination = os.getcwd()
    print(f'Saving files to {destination}.')
    out_path = os.path.join(destination, 'sec_dataframe.csv')
    # Output is appended, so only emit the header when the file is new or
    # empty; otherwise a re-run would bury a second header row in the data.
    write_cols = not os.path.exists(out_path) or os.path.getsize(out_path) == 0

    list_files = []
    # os.path.join makes the pattern work whether or not tsv_folder ends in a
    # separator; sorting keeps the output order deterministic.
    for file in sorted(glob.glob(os.path.join(tsv_folder, '*.tsv'))):
        try:
            year = int(file[-13:-9])
        except ValueError:
            # Not an index file named like YYYY-QTRn.tsv; ignore it.
            continue
        if first_year <= year <= last_year:
            list_files.append(file)

    columns = ['cik', 'firm_name', 'file_type', 'report_date', 'file_url_txt', 'file_url_html']
    for file_sec in list_files:
        try:
            print(f'Trying to read {file_sec}.')
            x = pd.read_csv(file_sec, sep='|', dtype=str, names=columns)
            print('Done. Processing...')
            # Copy so the padded CIK is written to an independent frame.
            x = x[x['file_type'] == file_type].copy()
            # Left-pad every CIK with zeros to 10 characters.
            x['cik'] = x['cik'].str.zfill(10)
            print('Writing...')
            x.to_csv(out_path, header=write_cols, mode='a', index=False)
            write_cols = False
        except Exception as ex:
            # Best effort: report the file and carry on with the next one.
            print('Can\'t read this file: ' + str(file_sec))
            print('Python returned this message: '+str(type(ex).__name__),str(ex.args)+'.')
def _log_8k_error(url, ex):
    """Print an exception and append it to ``errors.csv`` in the working directory."""
    print(str(type(ex).__name__), str(ex.args))
    # newline='' is what the csv module expects, avoiding blank rows on Windows.
    with open(os.getcwd() + '/' + 'errors.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([str(url), str(type(ex).__name__), str(ex.args)])


def _fetch_8k_soup(session, relative_url):
    """Download a document from the SEC archive and parse it with lxml."""
    r = session.get('https://www.sec.gov/Archives/' + relative_url)
    # Treat HTTP error pages as failures instead of searching them for the item.
    r.raise_for_status()
    return bs(r.content, 'lxml')


def id_8k(path_to_file, item8k, min_year=2019, user_agent=None):
    """Save the 8-K filings that mention a given item.

    Reads the CSV produced by ``creates_df`` 100,000 rows at a time. For each
    row whose ``report_date`` year is at least *min_year*, the page at
    ``file_url_html`` is downloaded; if its text contains *item8k*
    (case-insensitively), the document at ``file_url_txt`` is downloaded and
    appended, prettified, to
    ``<cik>_<firm_name>_<report_date>_8K_item_<item8k>.html`` in the current
    working directory. Failures are printed and appended to ``errors.csv``;
    processing then continues with the next row.

    Args:
        path_to_file: Path to the CSV written by ``creates_df``.
        item8k: Text identifying the wanted 8-K item.
        min_year: Earliest report year to process. Defaults to 2019, the
            cutoff that used to be hard-coded.
        user_agent: Optional ``User-Agent`` header for every request.
            NOTE(review): SEC's EDGAR access guidance reportedly expects
            automated clients to identify themselves; confirm against the
            current policy.

    Returns:
        None.
    """
    wanted = str(item8k).lower()
    # One session for the whole run so connections are reused.
    session = requests_retry_session()
    if user_agent:
        session.headers.update({'User-Agent': user_agent})

    for chunk in pd.read_csv(path_to_file, chunksize=100000, dtype=str, parse_dates=['report_date']):
        # Rows with a missing date compare False and are skipped.
        recent = chunk[chunk['report_date'].dt.year >= min_year]
        for _, col in recent.iterrows():
            try:
                soup = _fetch_8k_soup(session, col['file_url_html'])
            except Exception as ex:
                _log_8k_error(col['file_url_html'], ex)
                continue
            print('Got soup object from: ', str(col['file_url_html']), str(col['cik']))
            if not soup.text or wanted not in soup.text.lower():
                continue

            try:
                soup = _fetch_8k_soup(session, col['file_url_txt'])
            except Exception as ex:
                _log_8k_error(col['file_url_html'], ex)
                continue
            print('Got your filing item from: ', str(col['file_url_txt']), str(col['cik']))

            try:
                # Slashes and backslashes are stripped from the firm name so
                # it cannot introduce extra path components.
                firm = re.sub(r'[\\/]+', '', str(col['firm_name']))
                out_name = (os.getcwd() + '/' + str(col['cik']) + '_' + firm + '_' +
                            str(col['report_date'].date()) + '_8K_item_' + str(item8k) + '.html')
                # Explicit UTF-8 so non-ASCII filing text does not depend on
                # the platform's default encoding.
                with open(out_name, 'a', encoding='utf-8') as file:
                    file.write(soup.prettify())
                print('html file is done. Name: ', out_name)
            except Exception as ex:
                _log_8k_error(col['file_url_html'], ex)
                continue
发布于 2020-07-26 07:21:26
创建一个包含公司名称(或股票代码、CIK)的列表。如果您的列表保存在Excel中,请先将其转换为csv,然后执行以下操作:
# Build the company list from the first column of the CSV file.
# 'utf-8-sig' strips a leading byte-order mark if the exported CSV has one.
with open('/Path', newline='', encoding='utf-8-sig') as f:
    # Skip completely empty rows, which would otherwise raise IndexError.
    companies = [row[0] for row in csv.reader(f) if row]
然后,遍历该列表以下载文件:
# `Downloader` is presumably sec_edgar_downloader.Downloader (TODO confirm);
# `Path` stands for the download folder.
dl = Downloader(Path)
for company in companies:
    # The company must be passed as the second argument of get(); outside the
    # parentheses it only forms a discarded tuple and is never downloaded.
    dl.get("File Type", company)
发布于 2021-01-11 10:43:34
您也可以使用SEC Filings API。
您可以检索实时和历史SEC备案数据。它涵盖了所有类型的上市公司、共同基金和其他私人配售。
https://stackoverflow.com/questions/61483527
复制相似问题