Hi this is quite a complicated question
From this script:
# Sequential scraper: for every ticker, download two Yahoo Finance pages,
# pull three fields out of the JSON embedded in each page and write one
# worksheet row per ticker.
workbook = xlsxwriter.Workbook("test.xlsx")
worksheet = workbook.add_worksheet("Stocks")
stock = []  # here goes a list of 2000+ stock tickers as strings
sector = []
peg_ratio = []
foward_eps = []

# Loop invariants: built once instead of on every iteration.
url_profile = 'https://finance.yahoo.com/quote/{}/profile?p={}'
url_stats = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
# Matches the " -- Data -- " marker inside the <script> that holds the JSON.
pattern = re.compile(r'\s--\sData\s--\s')

for idx, ticker in enumerate(stock):
    # --- profile page: industry ---
    response = requests.get(url_profile.format(ticker, ticker), headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    # The slice starts two characters before the first "context" and drops
    # the last 12 characters so that only the JSON object is parsed.
    start = script_data.find("context") - 2
    json_data = json.loads(script_data[start:-12])
    try:
        sector.append(json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']['industry'])
    except (KeyError, IndexError, TypeError):
        sector.append("Error")

    # --- key-statistics page: PEG ratio and forward EPS ---
    response = requests.get(url_stats.format(ticker, ticker), headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    start = script_data.find("context") - 2
    json_data = json.loads(script_data[start:-12])
    try:
        peg_ratio.append(
            json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['pegRatio'][
                'fmt'])
    except (KeyError, IndexError, TypeError):
        peg_ratio.append("Error")
    try:
        foward_eps.append(
            json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['forwardEps'][
                'fmt'])
    except (KeyError, IndexError, TypeError):
        foward_eps.append("Error")

    # Worksheet rows are 1-based, list indices are 0-based.
    row = idx + 1
    worksheet.write(f"A{row}", ticker)
    worksheet.write(f"B{row}", sector[idx])
    worksheet.write(f"C{row}", foward_eps[idx])
    worksheet.write(f"D{row}", peg_ratio[idx])
workbook.close()
The code itself does what it is supposed to do (it gets the forward EPS, PEG ratio and sector for each stock and writes them to an Excel file), but it takes a lot of time because the stock list is quite long (2531 elements). Is there a way to make this code more efficient or faster?
I have attempted to follow the instructions from this video: https://www.youtube.com/watch?v=nFn4_nA_yk8
But I still need to write the information for every single stock to the Excel file. Is there a way I can optimize all of this?
Maybe by sending multiple requests at the same time and writing the data to the Excel file afterwards?
The only end goal is to make the whole process as fast as possible.
Thanks in advance(if you need any other information leave a comment i will answer as soon as possible)
Answers:
Thank you for visiting the Q&A section on Magenaut. Please note that all the answers may not help you solve the issue immediately. So please treat them as advisements. If you found the post helpful (or not), leave a comment & I’ll get back to you as soon as possible.
Method 1
First, you have to put the code in a function:
# --- globals ---
# URL templates; both placeholders receive the same ticker symbol.
url_profile = 'https://finance.yahoo.com/quote/{}/profile?p={}'
url_stats = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
}
# Matches the " -- Data -- " marker inside the <script> that holds the JSON.
# The backslashes are required: without them the pattern looks for the
# literal text "s--sDatas--s" and never matches.
pattern = re.compile(r'\s--\sData\s--\s')
# --- functions ---
def _load_quote_summary(url_template, stock_name):
    """Download one page for *stock_name* and return its ``QuoteSummaryStore`` dict.

    Returns an empty dict when the page has no matching data script or the
    parsed JSON lacks the expected nesting, so lookups fall back to "Error".
    """
    url = url_template.format(stock_name, stock_name)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', text=pattern)
    if script_tag is None:
        return {}
    script_data = script_tag.contents[0]
    # The slice starts two characters before the first "context" and drops
    # the last 12 characters so that only the JSON object is parsed.
    start = script_data.find("context") - 2
    data = json.loads(script_data[start:-12])
    try:
        return data['context']['dispatcher']['stores']['QuoteSummaryStore']
    except (KeyError, IndexError, TypeError):
        return {}


def _lookup(store, *keys):
    """Follow *keys* through nested containers; return "Error" if any step fails."""
    try:
        for key in keys:
            store = store[key]
    except (KeyError, IndexError, TypeError):
        return "Error"
    return store


def process(number, stock_name):
    """Scrape industry, forward EPS and PEG ratio for one ticker.

    The result is returned (for sequential use and pools) and also stored in
    the module-level ``results`` dict under ``number`` (for plain threads,
    which cannot return a value).

    Args:
        number: key under which the result is stored in ``results``.
        stock_name: ticker symbol inserted into both page URLs.

    Returns:
        A 5-tuple ``(stock_name, sector, foward_eps, foward_eps, peg_ratio)``.
        ``foward_eps`` occupies both index 2 and index 3; ``peg_ratio`` is at
        index 4. A field that cannot be read is the string "Error".
    """
    # One write that already contains the newline, so that lines printed by
    # concurrent threads are less likely to be interleaved.
    print(f'{number} {stock_name}\n', end='', flush=True)

    profile = _load_quote_summary(url_profile, stock_name)
    sector = _lookup(profile, 'assetProfile', 'industry')

    stats = _load_quote_summary(url_stats, stock_name)
    peg_ratio = _lookup(stats, 'defaultKeyStatistics', 'pegRatio', 'fmt')
    foward_eps = _lookup(stats, 'defaultKeyStatistics', 'forwardEps', 'fmt')

    row = (stock_name, sector, foward_eps, foward_eps, peg_ratio)
    # return data - for thread
    results[number] = row
    # return data - for normal execution
    return row
Next, you can run it the old (sequential) way:
stock = ['AAPL', 'GOOG', 'TESL', 'MSFT', 'AAPL', 'GOOG', 'TESL', 'MSFT']

_start = time.time()

results = {}  # process() also stores every row here

# The context manager closes (and thereby saves) the workbook.
with xlsxwriter.Workbook("test.xlsx") as workbook:
    worksheet = workbook.add_worksheet("Stocks")

    for number, stock_name in enumerate(stock, 1):
        data = process(number, stock_name)
        # process() returns foward_eps twice (index 2 and 3); peg_ratio is at
        # index 4, so the duplicate is skipped here.
        name, sector, foward_eps, _, peg_ratio = data
        # Columns A-D: stock_name, sector, foward_eps, peg_ratio
        worksheet.write_row(f"A{number}", (name, sector, foward_eps, peg_ratio))

_end = time.time()

print(_end - _start)
and this gives me time ~15s, (but sometimes even ~32s)
And now you can use threading to run the same function with different values at the same time.
Because a thread can’t return a result directly, I use the global dictionary results for this (threads share memory).
stock = ['AAPL', 'GOOG', 'TESL', 'MSFT', 'AAPL', 'GOOG', 'TESL', 'MSFT']

_start = time.time()

threads = []
results = {}  # filled by process() from inside every thread

# The context manager closes (and thereby saves) the workbook.
with xlsxwriter.Workbook("test.xlsx") as workbook:
    worksheet = workbook.add_worksheet("Stocks")

    # start all threads
    for number, stock_name in enumerate(stock, 1):
        t = threading.Thread(target=process, args=(number, stock_name))
        t.start()
        threads.append(t)

    # wait for end of all threads
    for t in threads:
        t.join()

    # use results
    for number, data in results.items():
        # process() stores foward_eps twice (index 2 and 3); peg_ratio is at
        # index 4, so the duplicate is skipped here.
        name, sector, foward_eps, _, peg_ratio = data
        # Columns A-D: stock_name, sector, foward_eps, peg_ratio
        worksheet.write_row(f"A{number}", (name, sector, foward_eps, peg_ratio))

_end = time.time()

print(_end - _start)
And this gives me time ~6s
For more stocks it would be better to use a thread pool (multiprocessing.pool.ThreadPool) so that only a few threads run at the same time, because running 2000+ threads at once is not a good idea.
Full working code
import requests
import time
import xlsxwriter
import re
from bs4 import BeautifulSoup
import json
import threading
# --- globals ---
# URL templates; both placeholders receive the same ticker symbol.
url_profile = 'https://finance.yahoo.com/quote/{}/profile?p={}'
url_stats = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
}
# Matches the " -- Data -- " marker inside the <script> that holds the JSON.
# The backslashes are required: without them the pattern looks for the
# literal text "s--sDatas--s" and never matches.
pattern = re.compile(r'\s--\sData\s--\s')
# --- functions ---
def _load_quote_summary(url_template, stock_name):
    """Download one page for *stock_name* and return its ``QuoteSummaryStore`` dict.

    Returns an empty dict when the page has no matching data script or the
    parsed JSON lacks the expected nesting, so lookups fall back to "Error".
    """
    url = url_template.format(stock_name, stock_name)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', text=pattern)
    if script_tag is None:
        return {}
    script_data = script_tag.contents[0]
    # The slice starts two characters before the first "context" and drops
    # the last 12 characters so that only the JSON object is parsed.
    start = script_data.find("context") - 2
    data = json.loads(script_data[start:-12])
    try:
        return data['context']['dispatcher']['stores']['QuoteSummaryStore']
    except (KeyError, IndexError, TypeError):
        return {}


def _lookup(store, *keys):
    """Follow *keys* through nested containers; return "Error" if any step fails."""
    try:
        for key in keys:
            store = store[key]
    except (KeyError, IndexError, TypeError):
        return "Error"
    return store


def process(number, stock_name):
    """Scrape industry, forward EPS and PEG ratio for one ticker.

    The result is returned (for sequential use and pools) and also stored in
    the module-level ``results`` dict under ``number`` (for plain threads,
    which cannot return a value).

    Args:
        number: key under which the result is stored in ``results``.
        stock_name: ticker symbol inserted into both page URLs.

    Returns:
        A 5-tuple ``(stock_name, sector, foward_eps, foward_eps, peg_ratio)``.
        ``foward_eps`` occupies both index 2 and index 3; ``peg_ratio`` is at
        index 4. A field that cannot be read is the string "Error".
    """
    # One write that already contains the newline, so that lines printed by
    # concurrent threads are less likely to be interleaved.
    print(f'{number} {stock_name}\n', end='', flush=True)

    profile = _load_quote_summary(url_profile, stock_name)
    sector = _lookup(profile, 'assetProfile', 'industry')

    stats = _load_quote_summary(url_stats, stock_name)
    peg_ratio = _lookup(stats, 'defaultKeyStatistics', 'pegRatio', 'fmt')
    foward_eps = _lookup(stats, 'defaultKeyStatistics', 'forwardEps', 'fmt')

    row = (stock_name, sector, foward_eps, foward_eps, peg_ratio)
    # return data - for thread
    results[number] = row
    # return data - for normal execution
    return row
# --- main ---

stock = [
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
]

# --- old version ---
# Sequential baseline: one ticker after another, timed for comparison.

_start = time.time()

results = {}  # process() also stores every row here

# The context manager closes (and thereby saves) the workbook.
with xlsxwriter.Workbook("test.xlsx") as workbook:
    worksheet = workbook.add_worksheet("Stocks")

    for number, stock_name in enumerate(stock, 1):
        data = process(number, stock_name)
        # process() returns foward_eps twice (index 2 and 3); peg_ratio is at
        # index 4, so the duplicate is skipped here.
        name, sector, foward_eps, _, peg_ratio = data
        # Columns A-D: stock_name, sector, foward_eps, peg_ratio
        worksheet.write_row(f"A{number}", (name, sector, foward_eps, peg_ratio))

_end = time.time()

print(_end - _start)
# --- new version ---
# One thread per ticker; rows are collected in ``results`` and written to the
# worksheet after every thread has finished.

_start = time.time()

threads = []
results = {}  # filled by process() from inside every thread

# The context manager closes (and thereby saves) the workbook.
with xlsxwriter.Workbook("test.xlsx") as workbook:
    worksheet = workbook.add_worksheet("Stocks")

    # start all threads
    for number, stock_name in enumerate(stock, 1):
        t = threading.Thread(target=process, args=(number, stock_name))
        t.start()
        threads.append(t)

    # wait for end of all threads
    for t in threads:
        t.join()

    # use results
    for number, data in results.items():
        # process() stores foward_eps twice (index 2 and 3); peg_ratio is at
        # index 4, so the duplicate is skipped here.
        name, sector, foward_eps, _, peg_ratio = data
        # Columns A-D: stock_name, sector, foward_eps, peg_ratio
        worksheet.write_row(f"A{number}", (name, sector, foward_eps, peg_ratio))

_end = time.time()

print(_end - _start)
Version with Pool
import requests
import time
import xlsxwriter
import re
from bs4 import BeautifulSoup
import json
import threading
import threading
from multiprocessing.pool import ThreadPool
# --- globals ---
# URL templates; both placeholders receive the same ticker symbol.
url_profile = 'https://finance.yahoo.com/quote/{}/profile?p={}'
url_stats = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
}
# Matches the " -- Data -- " marker inside the <script> that holds the JSON.
# The backslashes are required: without them the pattern looks for the
# literal text "s--sDatas--s" and never matches.
pattern = re.compile(r'\s--\sData\s--\s')
# --- functions ---
def _load_quote_summary(url_template, stock_name):
    """Download one page for *stock_name* and return its ``QuoteSummaryStore`` dict.

    Returns an empty dict when the page has no matching data script or the
    parsed JSON lacks the expected nesting, so lookups fall back to "Error".
    """
    url = url_template.format(stock_name, stock_name)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', text=pattern)
    if script_tag is None:
        return {}
    script_data = script_tag.contents[0]
    # The slice starts two characters before the first "context" and drops
    # the last 12 characters so that only the JSON object is parsed.
    start = script_data.find("context") - 2
    data = json.loads(script_data[start:-12])
    try:
        return data['context']['dispatcher']['stores']['QuoteSummaryStore']
    except (KeyError, IndexError, TypeError):
        return {}


def _lookup(store, *keys):
    """Follow *keys* through nested containers; return "Error" if any step fails."""
    try:
        for key in keys:
            store = store[key]
    except (KeyError, IndexError, TypeError):
        return "Error"
    return store


def process(number, stock_name):
    """Scrape industry, forward EPS and PEG ratio for one ticker.

    The result is returned (for sequential use and pools) and also stored in
    the module-level ``results`` dict under ``number`` (for plain threads,
    which cannot return a value).

    Args:
        number: key under which the result is stored in ``results``.
        stock_name: ticker symbol inserted into both page URLs.

    Returns:
        A 5-tuple ``(stock_name, sector, foward_eps, foward_eps, peg_ratio)``.
        ``foward_eps`` occupies both index 2 and index 3; ``peg_ratio`` is at
        index 4. A field that cannot be read is the string "Error".
    """
    # One write that already contains the newline, so that lines printed by
    # concurrent threads are less likely to be interleaved.
    print(f'{number} {stock_name}\n', end='', flush=True)

    profile = _load_quote_summary(url_profile, stock_name)
    sector = _lookup(profile, 'assetProfile', 'industry')

    stats = _load_quote_summary(url_stats, stock_name)
    peg_ratio = _lookup(stats, 'defaultKeyStatistics', 'pegRatio', 'fmt')
    foward_eps = _lookup(stats, 'defaultKeyStatistics', 'forwardEps', 'fmt')

    row = (stock_name, sector, foward_eps, foward_eps, peg_ratio)
    # return data - for thread
    results[number] = row
    # return data - for normal execution
    return row
# --- main ---
# Thread-pool version: at most 10 tickers are processed at the same time.

stock = [
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
]

_start = time.time()

results = {}  # process() also stores every row here

# The context manager closes (and thereby saves) the workbook.
with xlsxwriter.Workbook("test.xlsx") as workbook:
    worksheet = workbook.add_worksheet("Stocks")

    with ThreadPool(processes=10) as pool:
        # starmap() unpacks every (number, stock_name) pair into process()
        # and blocks until all results are available. It must be called
        # inside the ``with`` block because leaving it terminates the pool.
        pool_results = pool.starmap(process, enumerate(stock, 1))

    # starmap() returns results in input order, so enumerating from 1 gives
    # every row the same number that process() received.
    for number, data in enumerate(pool_results, 1):
        # process() returns foward_eps twice (index 2 and 3); peg_ratio is at
        # index 4, so the duplicate is skipped here.
        name, sector, foward_eps, _, peg_ratio = data
        # Columns A-D: stock_name, sector, foward_eps, peg_ratio
        worksheet.write_row(f"A{number}", (name, sector, foward_eps, peg_ratio))

_end = time.time()

print(_end - _start)
All methods was sourced from stackoverflow.com or stackexchange.com, is licensed under cc by-sa 2.5, cc by-sa 3.0 and cc by-sa 4.0