raise ValueError(err) - Implementation of multithreading using concurrent.futures in Python

I have written Python code which scrapes information from a website, and I tried to apply multithreading to it. Here's my code before applying multithreading; it runs perfectly on my PC:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import investpy

def getCurrencyHistorical():
    t1 = time.perf_counter()
    headers = {'Accept-Language': 'en-US,en;q=0.9',
               'Upgrade-Insecure-Requests': '1',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.63',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'Cache-Control': 'max-age=0',
               'Connection': 'keep-alive'}
    links = {"USD-IDR": "https://www.investing.com/currencies/usd-idr-historical-data",
             "USD-JPY": "https://www.investing.com/currencies/usd-jpy-historical-data",
             "USD-CNY": "https://www.investing.com/currencies/usd-cny-historical-data"}
    column = []
    output = []
    for key, value in links.items():
        page = requests.get(value, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.select('table')[0]
        # column names come from the <th> cells, data rows from the <td> cells
        rows = table.find_all('tr')
        for row in rows:
            cols = row.find_all('th')
            cols = [item.text.strip() for item in cols]
            column.append(cols)
            outs = row.find_all('td')
            outs = [item.text.strip() for item in outs]
            outs.append(key)
            output.append(outs)
        del output[0]
        #print(value)
        #print(output)
    column[0].append('Currency')
    df = pd.DataFrame(output, columns=column[0])
    t2 = time.perf_counter()
    print(f'Finished in {t2-t1} seconds')
    return df
But when I convert it to the version below, I get an error. Here's the code after applying multithreading:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
from functools import partial
import psutil

def process_data(key, page):
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.select('table')[0]
    #ColumnName
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('th')
        cols = [item.text.strip() for item in cols]
        outs = row.find_all('td')
        outs = [item.text.strip() for item in outs]
        outs.append(key)
    return cols, outs

def getCurrencyHistorical(session, pool_executor, item):
    key, value = item
    page = session.get(value)
    f = pool_executor.submit(process_data, key, page.content)
    return f.result()

def main():
    t1 = time.perf_counter()
    links = {"USD-IDR": "https://www.investing.com/currencies/usd-idr-historical-data",
             "USD-JPY": "https://www.investing.com/currencies/usd-jpy-historical-data",
             "USD-CNY": "https://www.investing.com/currencies/usd-cny-historical-data"}
    with requests.Session() as session:
        user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"
        session.headers = {'User-Agent': user_agent}
        column = []
        output = []
        with concurrent.futures.ProcessPoolExecutor(psutil.cpu_count(logical=False)) as pool_executor, \
             concurrent.futures.ThreadPoolExecutor(max_workers=len(links)) as executor:
            for return_value in executor.map(partial(getCurrencyHistorical, session, pool_executor), links.items()):
                cols, outs = return_value
                column.append(cols)
                output.append(outs)
        del output[0]
        column[0].append('Currency')
        df = pd.DataFrame(output, columns=column[0])
        t2 = time.perf_counter()
        print(f'Finished in {t2-t1} seconds')
        print(df)

# Required for Windows:
if __name__ == '__main__':
    main()
I got the error raise ValueError(err) from err. ValueError: 1 columns passed, passed data had 7 columns, and it comes from the line df = pd.DataFrame(output, columns=column[0]). What is wrong? Thank you.

process_data should be just like the non-multiprocessing case except that it processes only one key-value pair, but that's not what you have done. The main process must now do extend operations on the lists returned by process_data, not append.
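To see the shape problem in isolation, here is a minimal sketch with illustrative data only: process_data returns a whole page's worth of rows, so appending nests the page one level too deep, which is what triggers the "1 columns passed" complaint.

# Illustrative rows only -- not real scraped data.
page_rows = [['Aug 26, 2021', '14,417.5', 'USD-IDR'],
             ['Aug 25, 2021', '14,395.0', 'USD-IDR']]  # one page returned by process_data

output = []
output.append(page_rows)   # output == [[row1, row2]] -- one element holding the whole page
output = []
output.extend(page_rows)   # output == [row1, row2]   -- one element per row, what pd.DataFrame expects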
Update
You were not retrieving the data items for key "USD-JPY" because you were not looking at the correct table. You should be looking at the table with id 'curr_table'. I have also updated the multiprocessing pool size per my comment to your question.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
from functools import partial
from os import cpu_count

def process_data(key, page):
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find('table', {'id': 'curr_table'})
    #ColumnName
    rows = table.find_all('tr')
    column = []
    output = []
    for row in rows:
        cols = row.find_all('th')
        cols = [item.text.strip() for item in cols]
        column.append(cols)
        outs = row.find_all('td')
        outs = [item.text.strip() for item in outs]
        outs.append(key)
        output.append(outs)
    del output[0]
    return column, output

def getCurrencyHistorical(session, pool_executor, item):
    key, value = item
    page = session.get(value)
    f = pool_executor.submit(process_data, key, page.content)
    return f.result()

def main():
    t1 = time.perf_counter()
    links = {"USD-IDR": "https://www.investing.com/currencies/usd-idr-historical-data",
             "USD-JPY": "https://www.investing.com/currencies/usd-jpy-historical-data",
             "USD-CNY": "https://www.investing.com/currencies/usd-cny-historical-data"}
    with requests.Session() as session:
        user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"
        session.headers = {'User-Agent': user_agent}
        column = []
        output = []
        with concurrent.futures.ProcessPoolExecutor(min(len(links), cpu_count())) as pool_executor, \
             concurrent.futures.ThreadPoolExecutor(max_workers=len(links)) as executor:
            for return_value in executor.map(partial(getCurrencyHistorical, session, pool_executor), links.items()):
                cols, outs = return_value
                column.extend(cols)
                output.extend(outs)
        column[0].append('Currency')
        df = pd.DataFrame(output, columns=column[0])
        t2 = time.perf_counter()
        print(f'Finished in {t2-t1} seconds')
        pd.set_option("display.max_rows", None, "display.max_columns", None)
        print(df)

# Required for Windows:
if __name__ == '__main__':
    main()
Prints:
Finished in 2.1944901 seconds
Date Price Open High Low Change % Currency
0 Aug 26, 2021 14,417.5 14,425.0 14,430.0 14,411.0 0.16% USD-IDR
1 Aug 25, 2021 14,395.0 14,405.0 14,421.0 14,387.5 0.03% USD-IDR
2 Aug 24, 2021 14,390.0 14,395.0 14,407.5 14,377.5 -0.14% USD-IDR
3 Aug 23, 2021 14,410.0 14,435.0 14,438.5 14,404.0 -0.28% USD-IDR
4 Aug 20, 2021 14,450.0 14,475.0 14,485.0 14,422.5 0.35% USD-IDR
5 Aug 19, 2021 14,400.0 14,405.0 14,425.0 14,392.5 0.21% USD-IDR
6 Aug 18, 2021 14,370.0 14,387.5 14,400.0 14,372.5 0.00% USD-IDR
7 Aug 16, 2021 14,370.0 14,390.0 14,395.0 14,371.5 -0.10% USD-IDR
8 Aug 13, 2021 14,385.0 14,382.5 14,395.0 14,366.0 0.03% USD-IDR
9 Aug 12, 2021 14,380.0 14,395.0 14,407.5 14,366.0 0.00% USD-IDR
10 Aug 10, 2021 14,380.0 14,375.0 14,402.0 14,375.0 0.14% USD-IDR
11 Aug 09, 2021 14,360.0 14,370.0 14,387.5 14,357.5 0.07% USD-IDR
12 Aug 06, 2021 14,350.0 14,360.0 14,377.5 14,347.5 0.07% USD-IDR
13 Aug 05, 2021 14,340.0 14,330.0 14,360.0 14,321.0 0.21% USD-IDR
14 Aug 04, 2021 14,310.0 14,325.0 14,347.5 14,304.5 -0.21% USD-IDR
15 Aug 03, 2021 14,340.0 14,375.0 14,388.0 14,338.5 -0.55% USD-IDR
16 Aug 02, 2021 14,420.0 14,465.0 14,472.5 14,422.5 -0.28% USD-IDR
17 Jul 30, 2021 14,460.0 14,435.0 14,477.5 14,434.5 -0.14% USD-IDR
18 Jul 29, 2021 14,480.0 14,490.0 14,502.5 14,482.5 -0.03% USD-IDR
19 Jul 28, 2021 14,485.0 14,500.0 14,512.5 14,485.0 -0.03% USD-IDR
20 Jul 27, 2021 14,490.0 14,473.5 14,497.5 14,465.0 0.07% USD-IDR
21 Jul 26, 2021 14,480.0 14,510.0 14,522.5 14,470.0 -0.07% USD-IDR
22 Aug 26, 2021 110.10 109.98 110.23 109.93 0.10% USD-JPY
23 Aug 25, 2021 109.99 109.64 110.13 109.61 0.34% USD-JPY
24 Aug 24, 2021 109.62 109.69 109.89 109.41 -0.05% USD-JPY
25 Aug 23, 2021 109.68 109.81 110.15 109.65 -0.11% USD-JPY
26 Aug 20, 2021 109.80 109.75 109.89 109.57 0.07% USD-JPY
27 Aug 19, 2021 109.72 109.76 110.23 109.49 -0.02% USD-JPY
28 Aug 18, 2021 109.74 109.57 110.07 109.47 0.16% USD-JPY
29 Aug 17, 2021 109.57 109.22 109.66 109.12 0.31% USD-JPY
30 Aug 16, 2021 109.23 109.71 109.76 109.11 -0.31% USD-JPY
31 Aug 13, 2021 109.57 110.39 110.46 109.54 -0.73% USD-JPY
32 Aug 12, 2021 110.38 110.42 110.55 110.31 -0.02% USD-JPY
33 Aug 11, 2021 110.40 110.58 110.81 110.31 -0.14% USD-JPY
34 Aug 10, 2021 110.56 110.29 110.60 110.28 0.25% USD-JPY
35 Aug 09, 2021 110.28 110.26 110.36 110.02 0.03% USD-JPY
36 Aug 06, 2021 110.25 109.77 110.36 109.69 0.46% USD-JPY
37 Aug 05, 2021 109.74 109.49 109.79 109.40 0.25% USD-JPY
38 Aug 04, 2021 109.47 109.07 109.68 108.72 0.39% USD-JPY
39 Aug 03, 2021 109.04 109.32 109.36 108.88 -0.22% USD-JPY
40 Aug 02, 2021 109.28 109.69 109.79 109.18 -0.38% USD-JPY
41 Jul 30, 2021 109.70 109.49 109.83 109.36 0.22% USD-JPY
42 Jul 29, 2021 109.46 109.91 109.96 109.42 -0.40% USD-JPY
43 Jul 28, 2021 109.90 109.75 110.29 109.74 0.13% USD-JPY
44 Jul 27, 2021 109.76 110.36 110.41 109.58 -0.53% USD-JPY
45 Jul 26, 2021 110.34 110.57 110.59 110.11 -0.18% USD-JPY
46 Aug 26, 2021 6.4815 6.4725 6.4866 6.4725 0.09% USD-CNY
47 Aug 25, 2021 6.4756 6.4714 6.4811 6.4707 0.07% USD-CNY
48 Aug 24, 2021 6.4710 6.4790 6.4851 6.4676 -0.15% USD-CNY
49 Aug 23, 2021 6.4805 6.4915 6.4973 6.4788 -0.32% USD-CNY
50 Aug 20, 2021 6.5012 6.4960 6.5057 6.4935 0.11% USD-CNY
51 Aug 19, 2021 6.4942 6.4847 6.4997 6.4840 0.16% USD-CNY
52 Aug 18, 2021 6.4841 6.4861 6.4872 6.4776 -0.02% USD-CNY
53 Aug 17, 2021 6.4854 6.4787 6.4889 6.4759 0.17% USD-CNY
54 Aug 16, 2021 6.4742 6.4774 6.4810 6.4719 -0.04% USD-CNY
55 Aug 13, 2021 6.4768 6.4778 6.4854 6.4749 -0.02% USD-CNY
56 Aug 12, 2021 6.4782 6.4767 6.4811 6.4719 -0.00% USD-CNY
57 Aug 11, 2021 6.4783 6.4846 6.4894 6.4752 -0.11% USD-CNY
58 Aug 10, 2021 6.4852 6.4826 6.4875 6.4774 -0.01% USD-CNY
59 Aug 09, 2021 6.4857 6.4835 6.4895 6.4731 0.05% USD-CNY
60 Aug 06, 2021 6.4825 6.4660 6.4848 6.4622 0.34% USD-CNY
61 Aug 05, 2021 6.4608 6.4671 6.4677 6.4595 -0.07% USD-CNY
62 Aug 04, 2021 6.4655 6.4662 6.4673 6.4555 -0.07% USD-CNY
63 Aug 03, 2021 6.4700 6.4656 6.4710 6.4604 0.12% USD-CNY
64 Aug 02, 2021 6.4620 6.4615 6.4693 6.4580 0.02% USD-CNY
65 Jul 30, 2021 6.4609 6.4645 6.4693 6.4506 0.07% USD-CNY
66 Jul 29, 2021 6.4562 6.4908 6.4908 6.4544 -0.53% USD-CNY
67 Jul 28, 2021 6.4905 6.5095 6.5101 6.4891 -0.31% USD-CNY
68 Jul 27, 2021 6.5104 6.4760 6.5132 6.4735 0.43% USD-CNY
69 Jul 26, 2021 6.4825 6.4790 6.4875 6.4785 0.03% USD-CNY

Related

Iterate through a converted datetime pandas dataframe with an external function

https://rhodesmill.org/skyfield/positions.html#azimuth-and-altitude-from-a-geographic-position
Hi, I have a function that generates a sun-shot azimuth at a specific date, time, and place, using the package skyfield for astronomical calculations.
What I want:
iterate through the df2.cdt rows as fix times (currently astro = Nieuwe_diep.at(ts.utc(2019, 12, 31, 8, 41, 44)).observe(sun) is hard-coded for fix #1),
add a new column to df2, called df2.azimuth, containing the output of az through row iteration.
Currently I can only generate the azimuth for the first fix, with this code:
# Sunshot Azimuth - hour angle method
from skyfield.api import N,S,E,W, wgs84
from skyfield.api import load
import pandas as pd
# location Port of Den Helder, Nieuwe diep:
lat = 52+(57/60)+(26.9/3600)
lon = 4+(46/60)+(37.5/3600)
# fix1 # 2019-12-31 08:41:44 UTC
ts = load.timescale()
t = ts.utc(2019, 12, 31)
planets = load('de421.bsp')
earth, sun = planets['earth'], planets['sun']
# Altitude and azimuth in the sky for a specific geographic location
earth = planets['earth']
Nieuwe_diep = earth + wgs84.latlon(lat * N, lon * E, elevation_m=6)
astro = Nieuwe_diep.at(ts.utc(2019, 12, 31, 8, 41, 44)).observe(sun)
app = astro.apparent()
alt, az, distance = app.altaz()
print('alt: ' + alt.dstr())
print('az: ' + az.dstr())
print(distance)
print('lat, lon: ' + str(lat), str(lon))
#dt_utc = df2['datetime_UTC']
print('az: {:.3f}'.format(az.degrees)) # desired output for azimuth in decimal degrees
print('az: '+ az.dstr(format=u'{0}{1}°{2:02}′{3:02}.{4:0{5}}″'))
which results in:
alt: 04deg 18' 42.2"
az: 138deg 52' 22.3"
0.983305 au
lat, lon: 52.95747222222222 4.777083333333334
az: 138.873
az: 138°52′22.3″
I have a pandas dataframe that consists of the times at which I want to know the sun's azimuth, in the column cdt:
# cdt: converted datetime
df2['cdt'] = df2['datetime_UTC'].dt.strftime('%Y, %m, %d, %H, %M, %S')
print(df2)
cdt = df2.cdt
date time datetime_UTC cdt
0 2019-12-31 08:41:44 2019-12-31 08:41:44 2019, 12, 31, 08, 41, 44
1 2019-12-31 08:43:16 2019-12-31 08:43:16 2019, 12, 31, 08, 43, 16
2 2019-12-31 08:44:12 2019-12-31 08:44:12 2019, 12, 31, 08, 44, 12
3 2019-12-31 08:44:52 2019-12-31 08:44:52 2019, 12, 31, 08, 44, 52
4 2019-12-31 08:46:01 2019-12-31 08:46:01 2019, 12, 31, 08, 46, 01
5 2019-12-31 08:46:42 2019-12-31 08:46:42 2019, 12, 31, 08, 46, 42
6 2019-12-31 08:47:21 2019-12-31 08:47:21 2019, 12, 31, 08, 47, 21
7 2019-12-31 08:48:12 2019-12-31 08:48:12 2019, 12, 31, 08, 48, 12
8 2019-12-31 08:48:58 2019-12-31 08:48:58 2019, 12, 31, 08, 48, 58
9 2019-12-31 09:07:08 2019-12-31 09:07:08 2019, 12, 31, 09, 07, 08
10 2019-12-31 09:07:24 2019-12-31 09:07:24 2019, 12, 31, 09, 07, 24
11 2019-12-31 09:07:45 2019-12-31 09:07:45 2019, 12, 31, 09, 07, 45
12 2019-12-31 09:08:03 2019-12-31 09:08:03 2019, 12, 31, 09, 08, 03
13 2019-12-31 09:08:19 2019-12-31 09:08:19 2019, 12, 31, 09, 08, 19
14 2019-12-31 09:08:34 2019-12-31 09:08:34 2019, 12, 31, 09, 08, 34
15 2019-12-31 09:08:50 2019-12-31 09:08:50 2019, 12, 31, 09, 08, 50
16 2019-12-31 09:09:13 2019-12-31 09:09:13 2019, 12, 31, 09, 09, 13
17 2019-12-31 09:09:33 2019-12-31 09:09:33 2019, 12, 31, 09, 09, 33
18 2019-12-31 09:09:57 2019-12-31 09:09:57 2019, 12, 31, 09, 09, 57
19 2019-12-31 09:10:20 2019-12-31 09:10:20 2019, 12, 31, 09, 10, 20
I think this would work. You'd have to take the output and deal with it in a list, dictionary, or some other dataframe. Also, it seems like there should be a better way to pass and parse the UTC time, but I'm not familiar with the library.
import io
import pandas as pd
from skyfield.api import N, E, wgs84, load  # imports carried over from the question's snippet

data = '''date time datetime_UTC
2019-12-31 08:41:44 2019-12-31 08:41:44
2019-12-31 08:43:16 2019-12-31 08:43:16
2019-12-31 08:44:12 2019-12-31 08:44:12
'''
df2 = pd.read_csv(io.StringIO(data), sep=' \s+', engine='python')
df2['datetime_UTC'] = pd.to_datetime(df2['datetime_UTC'])
df2['cdt'] = df2['datetime_UTC'].dt.strftime('%Y,%m,%d,%H,%M,%S')
# note I changed the formatting to remove spaces for later parsing

def calc_az(tutc):
    yr = int(tutc.split(',')[0])
    mo = int(tutc.split(',')[1])
    da = int(tutc.split(',')[2])
    hr = int(tutc.split(',')[3])
    mi = int(tutc.split(',')[4])
    se = int(tutc.split(',')[5])
    # location Port of Den Helder, Nieuwe diep:
    lat = 52+(57/60)+(26.9/3600)
    lon = 4+(46/60)+(37.5/3600)
    ts = load.timescale()
    planets = load('de421.bsp')
    earth, sun = planets['earth'], planets['sun']
    # Altitude and azimuth in the sky for a specific geographic location
    Nieuwe_diep = earth + wgs84.latlon(lat * N, lon * E, elevation_m=6)
    # astro = Nieuwe_diep.at(ts.utc(2019, 12, 31, 8, 41, 44)).observe(sun)
    astro = Nieuwe_diep.at(ts.utc(yr, mo, da, hr, mi, se)).observe(sun)
    app = astro.apparent()
    alt, az, distance = app.altaz()
    print('alt: ' + alt.dstr())
    print('az: ' + az.dstr())
    print(distance)
    print('lat, lon: ' + str(lat), str(lon))
    print('az: {:.3f}'.format(az.degrees))  # desired output for azimuth in decimal degrees
    print('az: ' + az.dstr(format=u'{0}{1}°{2:02}′{3:02}.{4:0{5}}″'))
    print('\n'*2)
    return

df2['cdt'].apply(calc_az)
Output:
alt: 04deg 18' 42.2"
az: 138deg 52' 22.3"
0.983305 au
lat, lon: 52.95747222222222 4.777083333333334
az: 138.873
az: 138°52′22.3″
alt: 04deg 27' 47.3"
az: 139deg 11' 31.5"
0.983305 au
lat, lon: 52.95747222222222 4.777083333333334
az: 139.192
az: 139°11′31.5″
alt: 04deg 33' 17.4"
az: 139deg 23' 11.9"
0.983305 au
lat, lon: 52.95747222222222 4.777083333333334
az: 139.387
az: 139°23′11.9″
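A follow-up sketch of how that apply could fill the azimuth column the question asked for: return az.degrees instead of printing, and hoist the timescale/ephemeris loading out of the per-row function so de421.bsp is not re-read on every row. This assumes the same lat, lon, and skyfield imports as above.

# Load once, outside the per-row function (re-loading de421.bsp per row is slow).
ts = load.timescale()
planets = load('de421.bsp')
earth, sun = planets['earth'], planets['sun']
Nieuwe_diep = earth + wgs84.latlon(lat * N, lon * E, elevation_m=6)

def calc_az_value(tutc):
    yr, mo, da, hr, mi, se = (int(x) for x in tutc.split(','))
    astro = Nieuwe_diep.at(ts.utc(yr, mo, da, hr, mi, se)).observe(sun)
    alt, az, distance = astro.apparent().altaz()
    return az.degrees  # decimal degrees, e.g. 138.873 for fix #1

df2['azimuth'] = df2['cdt'].apply(calc_az_value)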

Parsing a table data from BSE site into Python

I am new to Python. I want to parse data from a table on the BSE site into Python.
I tried using the BeautifulSoup module, but I can't figure out which selector to use to find the correct table. In fact, even that particular table row is not getting displayed in Python.
The code that I tried was:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
page = 'https://www.bseindia.com/stock-share-price/itc-ltd/itc/500875/corp-actions/'
req = Request(page, headers = {'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
containers = page_soup.findAll("table", id = "tblinsidertrd")
This is giving a blank [ ] result.
Then I tried
containers = page_soup.findAll('td')
containers = page_soup.findAll('tr')
In both results I was unable to find the table or data I was looking for. I couldn't even find the table headings, viz. 'EX Date' and 'Amount'.
The table that I want from the BSE site is the dividend table on the corp-actions page (screenshot not included).
Please help me figure out where I am going wrong and why I am unable to view the dividend table data.
The content is dynamically generated. You can pull it from the API:
import pandas as pd
import requests
url = 'https://api.bseindia.com/BseIndiaAPI/api/CorporateAction/w?scripcode=500875'
headers = {'User-Agent': 'Mozilla/5.0'}
jsonData = requests.get(url, headers=headers).json()
df = pd.DataFrame(jsonData['Table'])
Output:
print(df)
Amount BCRD_from purpose_name
0 10.15 06 Jul 2020 Dividend
1 5.75 22 May 2019 Dividend
2 5.15 25 May 2018 Dividend
3 4.75 05 Jun 2017 Dividend
4 8.50 30 May 2016 Dividend
5 6.25 03 Jun 2015 Dividend
6 6.00 03 Jun 2014 Dividend
7 5.25 31 May 2013 Dividend
8 4.50 11 Jun 2012 Dividend
9 2.80 10 Jun 2011 Dividend
10 1.65 10 Jun 2011 Special Dividend
11 10.00 09 Jun 2010 Dividend
12 3.70 13 Jul 2009 Dividend
13 3.50 16 Jul 2008 Dividend
14 3.10 16 Jul 2007 Dividend
15 10.00 03 Jul 2001 Dividend
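If you only want the 'EX Date' and 'Amount' headings the question mentions, a quick filter over the same DataFrame should work; note that treating BCRD_from as the ex/record date is an assumption read off the output above, not documented API behavior:

# BCRD_from is assumed to correspond to the ex/record date shown on the page.
dividends = df[df['purpose_name'].str.contains('Dividend')]
print(dividends[['BCRD_from', 'Amount']])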

Scrape website whose url doesn't change [python with beautiful soup]

I am totally new to web scraping.
How can I scrape a website whose URL doesn't change with the page number?
For example, take this website: https://www.bseindia.com/corporates/Forth_Results.aspx
The URL doesn't change with the page number. How can we do this using Beautiful Soup in Python?
This script will walk through all the pages:
import requests
from bs4 import BeautifulSoup

url = 'https://www.bseindia.com/corporates/Forth_Results.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
page = 1
while True:
    print(page)
    rows = soup.select('.TTRow')
    if not rows:
        break
    # print some data to screen:
    for tr in rows:
        print(tr.get_text(strip=True, separator=' '))
    # to get the next page, you have to do a POST request with the correct data;
    # the data is located in <input name="..." value="..."> tags
    d = {}
    for i in soup.select('input'):
        d[i['name']] = i.get('value', '')
    # some data parameters need to be deleted:
    if 'ctl00$ContentPlaceHolder1$btnSubmit' in d:
        del d['ctl00$ContentPlaceHolder1$btnSubmit']
    # set correct page:
    page += 1
    d['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$gvData'
    d['__EVENTARGUMENT'] = 'Page${}'.format(page)
    soup = BeautifulSoup(requests.post(url, headers=headers, data=d).content, 'html.parser')
Prints:
1
500002 ABB 23 Jul 2020
531082 ALANKIT 23 Jul 2020
535916 ALSL 23 Jul 2020
526662 ARENTERP 23 Jul 2020
500215 ATFL 23 Jul 2020
540611 AUBANK 23 Jul 2020
532523 BIOCON 23 Jul 2020
533167 COROENGG 23 Jul 2020
532839 DISHTV 23 Jul 2020
500150 FOSECOIND 23 Jul 2020
507488 GMBREW 23 Jul 2020
532855 HARYNACAP 23 Jul 2020
541729 HDFCAMC 23 Jul 2020
524342 INDOBORAX 23 Jul 2020
522183 ITL 23 Jul 2020
534623 JUPITERIN 23 Jul 2020
533192 KCPSUGIND 23 Jul 2020
542753 MAHAANIMP 23 Jul 2020
532525 MAHABANK 23 Jul 2020
523754 MAHEPC 23 Jul 2020
531680 MAYUR 23 Jul 2020
526299 MPHASIS 23 Jul 2020
532416 NEXTMEDIA 23 Jul 2020
502294 NILACHAL 23 Jul 2020
538772 NIYOGIN 23 Jul 2020
2
530805 OIVL 23 Jul 2020
538742 PANACHE 23 Jul 2020
531879 PIONDIST 23 Jul 2020
540173 PNBHOUSING 23 Jul 2020
533178 PRADIP 23 Jul 2020
...and so on.
EDIT: To save it as CSV, you can use this:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.bseindia.com/corporates/Forth_Results.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
page = 1
all_data = []
while True:
    print(page)
    rows = soup.select('.TTRow')
    if not rows:
        break
    # collect the row data:
    for tr in rows:
        row = tr.get_text(strip=True, separator='|').split('|')
        all_data.append(row)
    # to get the next page, you have to do a POST request with the correct data;
    # the data is located in <input name="..." value="..."> tags
    d = {}
    for i in soup.select('input'):
        d[i['name']] = i.get('value', '')
    # some data parameters need to be deleted:
    if 'ctl00$ContentPlaceHolder1$btnSubmit' in d:
        del d['ctl00$ContentPlaceHolder1$btnSubmit']
    # set correct page:
    page += 1
    d['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$gvData'
    d['__EVENTARGUMENT'] = 'Page${}'.format(page)
    soup = BeautifulSoup(requests.post(url, headers=headers, data=d).content, 'html.parser')

df = pd.DataFrame(all_data)
print(df)
df.to_csv('data.csv')
Produces data.csv (screenshot from LibreOffice not included).
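As a small follow-up, each row has three fields (scrip code, short name, result date), so the DataFrame columns can be named when it is built; the labels below are illustrative guesses, not headers taken from the site:

# Hypothetical labels for the three fields visible in the printed rows above.
df = pd.DataFrame(all_data, columns=['Scrip Code', 'Security Name', 'Result Date'])
df.to_csv('data.csv', index=False)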

Python time delta

Please let me know if I am doing this part correctly. I am trying to grab files ONLY modified in the past 24 hours, but my output is ALL files in the directory regardless of modified time:
yesterday = date.today() - timedelta(days=1)
dayToStr = yesterday.strftime('%Y%m%d')
file_list_attr = sftp.listdir_attr()
for file in file_list_attr:
    if file.st_mtime <= dayToStr:
        print file
Output
-rw-r--r-- 1 4012 60 3404961 09 Jan 18:32 2_YEAR_912828UD0_20130109.dat
-rw-r--r-- 1 4012 60 10206411 09 Jan 18:32 3_YEAR_912828UG3_20130109.dat
-rw-r--r-- 1 4012 60 68311760 09 Jan 18:34 5_YEAR_912828UE8_20130109.dat
-rw-r--r-- 1 4012 60 54215712 09 Jan 18:35 7_YEAR_912828UF5_20130109.dat
-rw-r--r-- 1 4012 60 88014103 09 Jan 18:37 10_YEAR_912828TY6_20130109.dat
-rw-r--r-- 1 4012 60 53565072 09 Jan 18:38 30_YEAR_912810QY7_20130109.dat
-rw-r--r-- 1 4012 60 8527412 04 Jan 18:31 2_YEAR_912828UD0_20130104.dat
-rw-r--r-- 1 4012 60 21659138 04 Jan 18:31 3_YEAR_912828UC2_20130104.dat
-rw-r--r-- 1 4012 60 91281894 04 Jan 18:34 5_YEAR_912828UE8_20130104.dat
-rw-r--r-- 1 4012 60 80421507 04 Jan 18:36 7_YEAR_912828UF5_20130104.dat
-rw-r--r-- 1 4012 60 108700356 04 Jan 18:38 10_YEAR_912828TY6_20130104.dat
-rw-r--r-- 1 4012 60 50204292 04 Jan 18:39 30_YEAR_912810QY7_20130104.dat
-rw-r--r-- 1 4012 60 2319656 07 Jan 18:24 2_YEAR_912828UD0_20130107.dat
-rw-r--r-- 1 4012 60 6978760 07 Jan 18:24 3_YEAR_912828UC2_20130107.dat
-rw-r--r-- 1 4012 60 53579177 07 Jan 18:25 5_YEAR_912828UE8_20130107.dat
-rw-r--r-- 1 4012 60 46069381 07 Jan 18:26 7_YEAR_912828UF5_20130107.dat
-rw-r--r-- 1 4012 60 70802355 07 Jan 18:28 10_YEAR_912828TY6_20130107.dat
-rw-r--r-- 1 4012 60 43050822 07 Jan 18:29 30_YEAR_912810QY7_20130107.dat
-rw-r--r-- 1 4012 60 2713906 08 Jan 18:31 2_YEAR_912828UD0_20130108.dat
-rw-r--r-- 1 4012 60 8889264 08 Jan 18:31 3_YEAR_912828UC2_20130108.dat
-rw-r--r-- 1 4012 60 63857903 08 Jan 18:32 5_YEAR_912828UE8_20130108.dat
-rw-r--r-- 1 4012 60 55544096 08 Jan 18:34 7_YEAR_912828UF5_20130108.dat
-rw-r--r-- 1 4012 60 89750161 08 Jan 18:36 10_YEAR_912828TY6_20130108.dat
-rw-r--r-- 1 4012 60 59233399 08 Jan 18:37 30_YEAR_912810QY7_20130108.dat
file.st_mtime is an integer timestamp.
dayToStr is a string.
In Python2, integers always compare less than strings for the rather arbitrary reason that the i in int comes before the s in str alphabetically:
In [123]: 1234 < 'foobar'
Out[123]: True
In Python3, comparing an int to a str raises a TypeError:
>>> 1234 < 'foobar'
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: unorderable types: int() < str()
Instead, compare datetime objects:
import datetime as DT
import os

yesterday = DT.datetime.now() - DT.timedelta(days=1)
# or, if you want 00:00 AM, yesterday:
# yesterday = DT.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - DT.timedelta(days=1)
file_list_attr = sftp.listdir_attr()
for pfile in file_list_attr:
    if DT.datetime.fromtimestamp(pfile.st_mtime) > yesterday:
        print pfile
References:
datetime.fromtimestamp: This was used to convert the timestamp to a DT.datetime object.
datetime.replace: This was suggested for setting the hours, minutes, seconds (of yesterday) back to zero.
Appears to fail when comparing to 'yesterday':
for pfile in file_list_attr:
    print DT.datetime.fromtimestamp(pfile.st_mtime)
2013-01-09 18:32:06
2013-01-09 18:32:22
2013-01-09 18:34:07
2013-01-09 18:35:27
2013-01-09 18:37:38
for pfile in file_list_attr:
    print DT.datetime.fromtimestamp(pfile.st_mtime) > yesterday
Traceback (most recent call last):
  File "<pyshell#41>", line 2, in <module>
    print DT.datetime.fromtimestamp(pfile.st_mtime) > yesterday
TypeError: can't compare datetime.datetime to datetime.date
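That TypeError means yesterday is still the datetime.date from the original snippet (date.today() - timedelta(days=1)). Rebuilding it as a datetime, as in the answer's code above, makes the comparison valid:

import datetime as DT

# fromtimestamp() returns a datetime, so yesterday must be a datetime too,
# not a date -- Python refuses to compare the two.
yesterday = DT.datetime.now() - DT.timedelta(days=1)
for pfile in file_list_attr:
    if DT.datetime.fromtimestamp(pfile.st_mtime) > yesterday:
        print pfile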
Here's an example of how you can:
list all the files in a directory,
print all the files that meet the condition of being modified within the last 24 hours.
# Task: grab files ONLY modified in the past 24 hours
import os
import datetime

myPath = "/users/george/documents/"
# Adding all the files found in myPath to a collection
fileCollection = os.listdir(myPath)
# Iterating through the files, checking their last modified date
for i in fileCollection:
    # Getting the timestamp in a variable
    fileModTimeStamp = os.path.getmtime(myPath + str(i))
    fileModDateTime = datetime.datetime.fromtimestamp(fileModTimeStamp)
    # Calculating the time delta
    currentTime = datetime.datetime.now()
    timeElapsed = currentTime - fileModDateTime
    # 24h timedelta
    twentyFourHours = datetime.datetime(1900, 1, 2, 0, 0, 0, 0) - datetime.datetime(1900, 1, 1, 0, 0, 0, 0)
    # Print the files that meet the condition
    if timeElapsed <= twentyFourHours:
        print "The File: " + str(i) + " Was Last Modified At: " + str(fileModDateTime) + ", Which was about: " \
              + str(timeElapsed) + " ago."
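One note on the 24-hour delta: subtracting two hand-built datetimes works, but datetime.timedelta expresses it directly:

import datetime

# Equivalent to datetime(1900, 1, 2) - datetime(1900, 1, 1), but more direct.
twentyFourHours = datetime.timedelta(hours=24)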
I don't believe the os module will work, as I am using paramiko to SFTP to the remote host and perform actions on the files in the directory:
for filename in file_list_attr:
    mtime = os.path.getmtime(filename)
    print mtime
Traceback (most recent call last):
  File "<pyshell#22>", line 2, in <module>
    mtime = os.path.getmtime(filename)
  File "U:\ActivPy\lib\genericpath.py", line 54, in getmtime
    return os.stat(filename).st_mtime
TypeError: coercing to Unicode: need string or buffer, SFTPAttributes found
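Right: os.path.getmtime only works on local paths, and listdir_attr() returns SFTPAttributes objects, not strings. The remote mtime is already on those objects, so the first answer's comparison works unchanged; a sketch, assuming sftp is the connected paramiko SFTP client from the question:

import datetime as DT

yesterday = DT.datetime.now() - DT.timedelta(days=1)
for pfile in sftp.listdir_attr():
    # SFTPAttributes carries the remote file's Unix mtime; no os.path call needed.
    if DT.datetime.fromtimestamp(pfile.st_mtime) > yesterday:
        print pfile.filename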

Best way to extract datetime from string in python

I have a script that is parsing out fields within email headers that represent dates and times. Some examples of these strings are as follows:
Fri, 10 Jun 2011 11:04:17 +0200 (CEST)
Tue, 1 Jun 2011 11:04:17 +0200
Wed, 8 Jul 1992 4:23:11 -0200
Wed, 8 Jul 1992 4:23:11 -0200 EST
Before I was confronted with the CEST/EST portions at the ends of some of the strings, I had things working pretty well just using datetime.datetime.strptime like this:
msg['date'] = 'Wed, 8 Jul 1992 4:23:11 -0200'
mail_date = datetime.datetime.strptime(msg['date'][:-6], '%a, %d %b %Y %H:%M:%S')
I tried to put a regex together to match the date portions of the string while excluding the timezone information at the end, but I was having issues with the regex (I couldn't match a colon).
Is using a regex the best way to parse all of the examples above? If so, could someone share a regex that would match these examples? In the end I am looking to have a datetime object.
From python time to age part 2, timezones:
from email import utils
utils.parsedate_tz('Fri, 10 Jun 2011 11:04:17 +0200 (CEST)')
utils.parsedate_tz('Fri, 10 Jun 2011 11:04:17 +0200')
utils.parsedate_tz('Fri, 10 Jun 2011 11:04:17')
The output is:
(2011, 6, 10, 11, 4, 17, 0, 1, -1, 7200)
(2011, 6, 10, 11, 4, 17, 0, 1, -1, 7200)
(2011, 6, 10, 11, 4, 17, 0, 1, -1, None)
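Since the goal is a datetime object, the 10-tuple from parsedate_tz can be finished off with email.utils.mktime_tz, which folds in the UTC offset and returns a timestamp for datetime.fromtimestamp:

import datetime
from email import utils

parsed = utils.parsedate_tz('Fri, 10 Jun 2011 11:04:17 +0200 (CEST)')
timestamp = utils.mktime_tz(parsed)  # applies the +0200 offset
dt = datetime.datetime.fromtimestamp(timestamp)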
Perhaps I misunderstood your question, but won't a simple split suffice?
#!/usr/bin/python
d = ["Fri, 10 Jun 2011 11:04:17 +0200 (CEST)", "Tue, 1 Jun 2011 11:04:17 +0200",
     "Wed, 8 Jul 1992 4:23:11 -0200", "Wed, 8 Jul 1992 4:23:11 -0200 EST"]
for i in d:
    print " ".join(i.split()[0:5])
Fri, 10 Jun 2011 11:04:17
Tue, 1 Jun 2011 11:04:17
Wed, 8 Jul 1992 4:23:11
Wed, 8 Jul 1992 4:23:11
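To end up with a datetime object, as the question asks, the trimmed string can go straight into the strptime format the question already uses (note this discards the timezone offset, which the parsedate_tz approach above preserves):

import datetime

s = "Wed, 8 Jul 1992 4:23:11 -0200 EST"
trimmed = " ".join(s.split()[0:5])  # 'Wed, 8 Jul 1992 4:23:11'
mail_date = datetime.datetime.strptime(trimmed, '%a, %d %b %Y %H:%M:%S')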
