Scrape website whose url doesn't change [python with beautiful soup]

Scrape website whose url doesn't change [python with beautiful soup] - python

I am totally new to web scraping.
How can I scrape a website whose URL doesn't change with the page number?
Take this website, for example: https://www.bseindia.com/corporates/Forth_Results.aspx
The URL doesn't change with the page number.
This is the same thing I am asking about: how can we do it using Beautiful Soup in Python?

This script will go through all the pages and print the rows of each one:
import requests
from bs4 import BeautifulSoup
url = 'https://www.bseindia.com/corporates/Forth_Results.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

# Page 1 is a plain GET; every later page is requested by POSTing the
# current page's form fields back to the same URL (see the end of the loop).
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

page = 1
while True:
    print(page)

    rows = soup.select('.TTRow')
    if not rows:
        # A response without any result rows ends the loop.
        break

    # print some data to screen:
    for tr in rows:
        print(tr.get_text(strip=True, separator=' '))

    # to get correct page, you have to do POST request with correct data
    # the data is located in <input name="..." value=".."> tags
    # ('input[name]' skips inputs that have no name attribute, which would
    # otherwise raise KeyError on i['name'])
    d = {i['name']: i.get('value', '') for i in soup.select('input[name]')}

    # some data parameters needs to be deleted:
    d.pop('ctl00$ContentPlaceHolder1$btnSubmit', None)

    # set correct page:
    page += 1
    d['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$gvData'
    d['__EVENTARGUMENT'] = 'Page${}'.format(page)

    soup = BeautifulSoup(requests.post(url, headers=headers, data=d).content, 'html.parser')
Prints:
1
500002 ABB 23 Jul 2020
531082 ALANKIT 23 Jul 2020
535916 ALSL 23 Jul 2020
526662 ARENTERP 23 Jul 2020
500215 ATFL 23 Jul 2020
540611 AUBANK 23 Jul 2020
532523 BIOCON 23 Jul 2020
533167 COROENGG 23 Jul 2020
532839 DISHTV 23 Jul 2020
500150 FOSECOIND 23 Jul 2020
507488 GMBREW 23 Jul 2020
532855 HARYNACAP 23 Jul 2020
541729 HDFCAMC 23 Jul 2020
524342 INDOBORAX 23 Jul 2020
522183 ITL 23 Jul 2020
534623 JUPITERIN 23 Jul 2020
533192 KCPSUGIND 23 Jul 2020
542753 MAHAANIMP 23 Jul 2020
532525 MAHABANK 23 Jul 2020
523754 MAHEPC 23 Jul 2020
531680 MAYUR 23 Jul 2020
526299 MPHASIS 23 Jul 2020
532416 NEXTMEDIA 23 Jul 2020
502294 NILACHAL 23 Jul 2020
538772 NIYOGIN 23 Jul 2020
2
530805 OIVL 23 Jul 2020
538742 PANACHE 23 Jul 2020
531879 PIONDIST 23 Jul 2020
540173 PNBHOUSING 23 Jul 2020
533178 PRADIP 23 Jul 2020
...and so on.
EDIT: To save it as CSV, you can use this:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://www.bseindia.com/corporates/Forth_Results.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

# Page 1 is a plain GET; every later page is requested by POSTing the
# current page's form fields back to the same URL (see the end of the loop).
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

page = 1
all_data = []  # one list of cell texts per result row, across all pages
while True:
    print(page)

    rows = soup.select('.TTRow')
    if not rows:
        # A response without any result rows ends the loop.
        break

    # collect the cells of every row on this page:
    for tr in rows:
        # '|' is used as a temporary separator so the row text can be split
        # back into its individual cells.
        row = tr.get_text(strip=True, separator='|').split('|')
        all_data.append(row)

    # to get correct page, you have to do POST request with correct data
    # the data is located in <input name="..." value=".."> tags
    # ('input[name]' skips inputs that have no name attribute, which would
    # otherwise raise KeyError on i['name'])
    d = {i['name']: i.get('value', '') for i in soup.select('input[name]')}

    # some data parameters needs to be deleted:
    d.pop('ctl00$ContentPlaceHolder1$btnSubmit', None)

    # set correct page:
    page += 1
    d['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$gvData'
    d['__EVENTARGUMENT'] = 'Page${}'.format(page)

    soup = BeautifulSoup(requests.post(url, headers=headers, data=d).content, 'html.parser')

# All pages collected: build the table once and write it out.
df = pd.DataFrame(all_data)
print(df)
df.to_csv('data.csv')
Produces data.csv (screenshot from LibreOffice):

Related

raise ValueError(err) - Implementation of multithreading using concurrent.future in Python

I have written Python code which scrapes information from a website, and I tried to apply multithreading to it. Here's my code before applying multithreading; it runs perfectly on my PC:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import investpy
# Single-threaded version from the question: for each currency pair in
# `links`, download the historical-data page, take the first <table> on it,
# collect the <th> texts of each row into `column` and the <td> texts (plus
# the pair name) into `output`, then build and return a DataFrame.
# NOTE(review): the original indentation of this snippet is lost, so the
# exact nesting of the statements below (for example whether `del output[0]`
# runs once or once per link) cannot be confirmed from here.
# NOTE(review): `time.perf_counter()` is called, but no `import time` appears
# in the import lines shown with this snippet.
def getCurrencyHistorical():
t1 = time.perf_counter()
# Browser-like request headers sent with every page download.
headers = {'Accept-Language': 'en-US,en;q=0.9',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.63',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive'}
# Currency-pair label -> page to scrape.
links = {"USD-IDR":"https://www.investing.com/currencies/usd-idr-historical-data",
"USD-JPY":"https://www.investing.com/currencies/usd-jpy-historical-data",
"USD-CNY":"https://www.investing.com/currencies/usd-cny-historical-data"}
column = []
output = []
for key, value in links.items():
page = requests.get(value, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# The first <table> element of the page is the one that gets parsed.
table =soup.select('table')[0]
#ColumnName
rows = table.find_all('tr')
for row in rows:
cols = row.find_all('th')
cols = [item.text.strip() for item in cols]
column.append(cols)
outs = row.find_all('td')
outs = [item.text.strip() for item in outs]
# Tag the row with the currency pair it belongs to.
outs.append(key)
output.append(outs)
del output[0]
#print(value)
#print(output)
# The first collected <th> list is used as the column names, with one extra
# name for the currency label appended to every row above.
column[0].append('Currency')
df = pd.DataFrame(output, columns = column[0])
t2 = time.perf_counter()
print(f'Finished in {t2-t1} seconds')
return(df)
But when I converted it to the version below, I got an error. Here's the code after applying multithreading:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
from functools import partial
import psutil
# Parse one downloaded page (question's version): take the first <table>,
# and for each <tr> compute `cols` (the <th> texts) and `outs` (the <td>
# texts with `key` appended); return `cols, outs`.
# NOTE(review): the original indentation is lost. The error quoted in the
# question ("1 columns passed, passed data had 7 columns") is consistent with
# the `return` sitting after the loop, i.e. only the values of the last row
# are returned and nothing is accumulated per row.
def process_data(key, page):
soup = BeautifulSoup(page, 'html.parser')
table =soup.select('table')[0]
#ColumnName
rows = table.find_all('tr')
for row in rows:
cols = row.find_all('th')
cols = [item.text.strip() for item in cols]
outs = row.find_all('td')
outs = [item.text.strip() for item in outs]
outs.append(key)
return cols, outs
# Thread-pool worker (question's version): `item` is a (key, url) pair.
# Downloads the url with the shared `session`, hands the page bytes to
# process_data in `pool_executor`, and blocks until that result is ready.
def getCurrencyHistorical(session, pool_executor, item):
key, value = item
page = session.get(value)
# Parsing is submitted to the other executor; only the raw content is passed.
f = pool_executor.submit(process_data, key, page.content)
return f.result()
# Driver (question's version): fetches every link through a thread pool,
# parses through a process pool, gathers the returned (cols, outs) pairs and
# builds a DataFrame from them.
# NOTE(review): the original indentation is lost; the nesting of the lines
# after the executor loop cannot be confirmed from here.
def main():
t1 = time.perf_counter()
# Currency-pair label -> page to scrape.
links = {"USD-IDR":"https://www.investing.com/currencies/usd-idr-historical-data",
"USD-JPY":"https://www.investing.com/currencies/usd-jpy-historical-data",
"USD-CNY":"https://www.investing.com/currencies/usd-cny-historical-data"}
with requests.Session() as session:
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"
session.headers = {'User-Agent': user_agent}
column = []
output = []
# Process pool sized by psutil's physical core count; thread pool with one
# thread per link.
with concurrent.futures.ProcessPoolExecutor(psutil.cpu_count(logical=False)) as pool_executor, \
concurrent.futures.ThreadPoolExecutor(max_workers=len(links)) as executor:
# partial() pre-binds session and pool so map() only supplies the (key, url) item.
for return_value in executor.map(partial(getCurrencyHistorical, session, pool_executor), links.items()):
cols, outs = return_value
# Each call contributes exactly one `cols` list and one `outs` list here.
column.append(cols)
output.append(outs)
del output[0]
column[0].append('Currency')
# This is the line the question reports the ValueError from.
df = pd.DataFrame(output, columns = column[0])
t2 = time.perf_counter()
print(f'Finished in {t2-t1} seconds')
print(df)
# Required for Windows:
if __name__ == '__main__':
main()
I got the error raise ValueError(err) from err. ValueError: 1 columns passed, passed data had 7 columns. It comes from the line df = pd.DataFrame(output, columns = column[0]). What is wrong? Thank you.

process_data should be just like the non-multiprocessing case except for the fact it is only processing one key-value pair, but that's not what you have done. The main process now must do extend operations on the lists returned by process_data.
Update
You were not retrieving the data items for key "USD-JPY" because you were not looking at the correct table. You should be looking at the table with id 'curr_table'. I have also updated the multiprocessing pool size per my comment to your question.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
from functools import partial
from os import cpu_count
def process_data(key, page):
    """Parse one currency's historical-data page into header and data rows.

    Args:
        key: Label of the currency pair (e.g. "USD-JPY"); appended as the
            last cell of every data row.
        page: HTML of the downloaded page.

    Returns:
        A ``(column, output)`` tuple. ``column`` holds one list of ``<th>``
        texts per table row, so ``column[0]`` is the header row and rows
        without ``<th>`` cells contribute an empty list. ``output`` holds one
        list per row after the first, each made of the row's ``<td>`` texts
        followed by ``key``.

    Raises:
        ValueError: If the page contains no ``<table id="curr_table">``.
    """
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find('table', {'id': 'curr_table'})
    if table is None:
        # Without this check the failure would surface as an opaque
        # AttributeError on None in the next statement.
        raise ValueError(f"no table with id 'curr_table' found for {key}")

    rows = table.find_all('tr')
    column = [[th.text.strip() for th in row.find_all('th')] for row in rows]
    output = [[td.text.strip() for td in row.find_all('td')] + [key] for row in rows]

    # The first row is the header row (no <td> cells), so it is dropped from
    # the data. Slicing, unlike `del output[0]`, also tolerates an empty table.
    return column, output[1:]
def getCurrencyHistorical(session, pool_executor, item):
    """Download one currency page and parse it in the process pool.

    Runs in a worker thread: the download is I/O-bound, while the HTML
    parsing is handed to ``pool_executor``.

    Args:
        session: Shared ``requests.Session`` used for the download.
        pool_executor: Executor that ``process_data`` is submitted to.
        item: ``(key, url)`` pair, i.e. one entry of ``links.items()``.

    Returns:
        Whatever ``process_data(key, content)`` returns for this page.
    """
    key, value = item
    page = session.get(value)
    # Fail here on an HTTP error instead of parsing an error page.
    page.raise_for_status()
    f = pool_executor.submit(process_data, key, page.content)
    # Block this worker thread until the parsing has finished.
    return f.result()
def main():
    """Scrape all configured currency pairs concurrently and print one table.

    Downloads run in a thread pool (one thread per link), parsing runs in a
    process pool, and the per-page results are concatenated in the order of
    ``links`` before being turned into a single DataFrame.
    """
    t1 = time.perf_counter()
    # Currency-pair label -> page to scrape.
    links = {"USD-IDR":"https://www.investing.com/currencies/usd-idr-historical-data",
             "USD-JPY":"https://www.investing.com/currencies/usd-jpy-historical-data",
             "USD-CNY":"https://www.investing.com/currencies/usd-cny-historical-data"}
    column = []
    output = []
    with requests.Session() as session:
        user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"
        session.headers = {'User-Agent': user_agent}
        # cpu_count() may return None when the count cannot be determined;
        # fall back to a single process instead of failing inside min().
        n_processes = min(len(links), cpu_count() or 1)
        with concurrent.futures.ProcessPoolExecutor(n_processes) as pool_executor, \
                concurrent.futures.ThreadPoolExecutor(max_workers=len(links)) as executor:
            # partial() pre-binds the session and the process pool, so map()
            # only has to supply the (key, url) item.
            fetch = partial(getCurrencyHistorical, session, pool_executor)
            for cols, outs in executor.map(fetch, links.items()):
                # Each page returns lists of rows, hence extend, not append.
                column.extend(cols)
                output.extend(outs)

    # column[0] is the header row of the first page; every data row carries
    # one extra cell (the currency label), so name that column too.
    column[0].append('Currency')
    df = pd.DataFrame(output, columns = column[0])
    t2 = time.perf_counter()
    print(f'Finished in {t2-t1} seconds')
    # Show the complete table instead of pandas' truncated preview.
    pd.set_option("display.max_rows", None, "display.max_columns", None)
    print(df)


# Required for Windows:
if __name__ == '__main__':
    main()
Prints:
Finished in 2.1944901 seconds
Date Price Open High Low Change % Currency
0 Aug 26, 2021 14,417.5 14,425.0 14,430.0 14,411.0 0.16% USD-IDR
1 Aug 25, 2021 14,395.0 14,405.0 14,421.0 14,387.5 0.03% USD-IDR
2 Aug 24, 2021 14,390.0 14,395.0 14,407.5 14,377.5 -0.14% USD-IDR
3 Aug 23, 2021 14,410.0 14,435.0 14,438.5 14,404.0 -0.28% USD-IDR
4 Aug 20, 2021 14,450.0 14,475.0 14,485.0 14,422.5 0.35% USD-IDR
5 Aug 19, 2021 14,400.0 14,405.0 14,425.0 14,392.5 0.21% USD-IDR
6 Aug 18, 2021 14,370.0 14,387.5 14,400.0 14,372.5 0.00% USD-IDR
7 Aug 16, 2021 14,370.0 14,390.0 14,395.0 14,371.5 -0.10% USD-IDR
8 Aug 13, 2021 14,385.0 14,382.5 14,395.0 14,366.0 0.03% USD-IDR
9 Aug 12, 2021 14,380.0 14,395.0 14,407.5 14,366.0 0.00% USD-IDR
10 Aug 10, 2021 14,380.0 14,375.0 14,402.0 14,375.0 0.14% USD-IDR
11 Aug 09, 2021 14,360.0 14,370.0 14,387.5 14,357.5 0.07% USD-IDR
12 Aug 06, 2021 14,350.0 14,360.0 14,377.5 14,347.5 0.07% USD-IDR
13 Aug 05, 2021 14,340.0 14,330.0 14,360.0 14,321.0 0.21% USD-IDR
14 Aug 04, 2021 14,310.0 14,325.0 14,347.5 14,304.5 -0.21% USD-IDR
15 Aug 03, 2021 14,340.0 14,375.0 14,388.0 14,338.5 -0.55% USD-IDR
16 Aug 02, 2021 14,420.0 14,465.0 14,472.5 14,422.5 -0.28% USD-IDR
17 Jul 30, 2021 14,460.0 14,435.0 14,477.5 14,434.5 -0.14% USD-IDR
18 Jul 29, 2021 14,480.0 14,490.0 14,502.5 14,482.5 -0.03% USD-IDR
19 Jul 28, 2021 14,485.0 14,500.0 14,512.5 14,485.0 -0.03% USD-IDR
20 Jul 27, 2021 14,490.0 14,473.5 14,497.5 14,465.0 0.07% USD-IDR
21 Jul 26, 2021 14,480.0 14,510.0 14,522.5 14,470.0 -0.07% USD-IDR
22 Aug 26, 2021 110.10 109.98 110.23 109.93 0.10% USD-JPY
23 Aug 25, 2021 109.99 109.64 110.13 109.61 0.34% USD-JPY
24 Aug 24, 2021 109.62 109.69 109.89 109.41 -0.05% USD-JPY
25 Aug 23, 2021 109.68 109.81 110.15 109.65 -0.11% USD-JPY
26 Aug 20, 2021 109.80 109.75 109.89 109.57 0.07% USD-JPY
27 Aug 19, 2021 109.72 109.76 110.23 109.49 -0.02% USD-JPY
28 Aug 18, 2021 109.74 109.57 110.07 109.47 0.16% USD-JPY
29 Aug 17, 2021 109.57 109.22 109.66 109.12 0.31% USD-JPY
30 Aug 16, 2021 109.23 109.71 109.76 109.11 -0.31% USD-JPY
31 Aug 13, 2021 109.57 110.39 110.46 109.54 -0.73% USD-JPY
32 Aug 12, 2021 110.38 110.42 110.55 110.31 -0.02% USD-JPY
33 Aug 11, 2021 110.40 110.58 110.81 110.31 -0.14% USD-JPY
34 Aug 10, 2021 110.56 110.29 110.60 110.28 0.25% USD-JPY
35 Aug 09, 2021 110.28 110.26 110.36 110.02 0.03% USD-JPY
36 Aug 06, 2021 110.25 109.77 110.36 109.69 0.46% USD-JPY
37 Aug 05, 2021 109.74 109.49 109.79 109.40 0.25% USD-JPY
38 Aug 04, 2021 109.47 109.07 109.68 108.72 0.39% USD-JPY
39 Aug 03, 2021 109.04 109.32 109.36 108.88 -0.22% USD-JPY
40 Aug 02, 2021 109.28 109.69 109.79 109.18 -0.38% USD-JPY
41 Jul 30, 2021 109.70 109.49 109.83 109.36 0.22% USD-JPY
42 Jul 29, 2021 109.46 109.91 109.96 109.42 -0.40% USD-JPY
43 Jul 28, 2021 109.90 109.75 110.29 109.74 0.13% USD-JPY
44 Jul 27, 2021 109.76 110.36 110.41 109.58 -0.53% USD-JPY
45 Jul 26, 2021 110.34 110.57 110.59 110.11 -0.18% USD-JPY
46 Aug 26, 2021 6.4815 6.4725 6.4866 6.4725 0.09% USD-CNY
47 Aug 25, 2021 6.4756 6.4714 6.4811 6.4707 0.07% USD-CNY
48 Aug 24, 2021 6.4710 6.4790 6.4851 6.4676 -0.15% USD-CNY
49 Aug 23, 2021 6.4805 6.4915 6.4973 6.4788 -0.32% USD-CNY
50 Aug 20, 2021 6.5012 6.4960 6.5057 6.4935 0.11% USD-CNY
51 Aug 19, 2021 6.4942 6.4847 6.4997 6.4840 0.16% USD-CNY
52 Aug 18, 2021 6.4841 6.4861 6.4872 6.4776 -0.02% USD-CNY
53 Aug 17, 2021 6.4854 6.4787 6.4889 6.4759 0.17% USD-CNY
54 Aug 16, 2021 6.4742 6.4774 6.4810 6.4719 -0.04% USD-CNY
55 Aug 13, 2021 6.4768 6.4778 6.4854 6.4749 -0.02% USD-CNY
56 Aug 12, 2021 6.4782 6.4767 6.4811 6.4719 -0.00% USD-CNY
57 Aug 11, 2021 6.4783 6.4846 6.4894 6.4752 -0.11% USD-CNY
58 Aug 10, 2021 6.4852 6.4826 6.4875 6.4774 -0.01% USD-CNY
59 Aug 09, 2021 6.4857 6.4835 6.4895 6.4731 0.05% USD-CNY
60 Aug 06, 2021 6.4825 6.4660 6.4848 6.4622 0.34% USD-CNY
61 Aug 05, 2021 6.4608 6.4671 6.4677 6.4595 -0.07% USD-CNY
62 Aug 04, 2021 6.4655 6.4662 6.4673 6.4555 -0.07% USD-CNY
63 Aug 03, 2021 6.4700 6.4656 6.4710 6.4604 0.12% USD-CNY
64 Aug 02, 2021 6.4620 6.4615 6.4693 6.4580 0.02% USD-CNY
65 Jul 30, 2021 6.4609 6.4645 6.4693 6.4506 0.07% USD-CNY
66 Jul 29, 2021 6.4562 6.4908 6.4908 6.4544 -0.53% USD-CNY
67 Jul 28, 2021 6.4905 6.5095 6.5101 6.4891 -0.31% USD-CNY
68 Jul 27, 2021 6.5104 6.4760 6.5132 6.4735 0.43% USD-CNY
69 Jul 26, 2021 6.4825 6.4790 6.4875 6.4785 0.03% USD-CNY

Parsing a table data from BSE site into Python

I am new to python. I want to parse a data from a table in BSE site into python.
I tried using beautifulsoup module but I am unable to know which reference to use, so as to find the correct table. In fact even that particular table row is not getting displayed in python
The code that I tried was:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
# Question's attempt: download the corp-actions page with a browser-like
# User-Agent and parse the returned HTML (`soup` is the BeautifulSoup class,
# imported under that alias above).
page = 'https://www.bseindia.com/stock-share-price/itc-ltd/itc/500875/corp-actions/'
req = Request(page, headers = {'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
# Look for every <table> whose id is "tblinsidertrd".
containers = page_soup.findAll("table", id = "tblinsidertrd")
This is giving a blank [ ] result.
Then I tried
# Second attempt: list every <td> cell of the parsed page ...
containers = page_soup.findAll('td')
# ... or every <tr> row.
containers = page_soup.findAll('tr')
In both results I was unable to find the table or data I was looking for. I couldn't even find the table headings viz 'EX Date' and 'Amount'
The table that I want from BSE site is highlighted below:
Please help me as to where I am going wrong and why I am unable to view the dividend table data?

The content is dynamically generated. You can pull it from the API:
import pandas as pd
import requests
# API endpoint queried with the same scrip code (500875) that appears in the
# URL of the page the question tried to scrape.
url = 'https://api.bseindia.com/BseIndiaAPI/api/CorporateAction/w?scripcode=500875'
headers = {'User-Agent': 'Mozilla/5.0'}

response = requests.get(url, headers=headers, timeout=30)
# Stop on an HTTP error instead of trying to decode an error page as JSON.
response.raise_for_status()
jsonData = response.json()

# The 'Table' entry of the response is loaded into the DataFrame.
df = pd.DataFrame(jsonData['Table'])
Output:
print(df)
Amount BCRD_from purpose_name
0 10.15 06 Jul 2020 Dividend
1 5.75 22 May 2019 Dividend
2 5.15 25 May 2018 Dividend
3 4.75 05 Jun 2017 Dividend
4 8.50 30 May 2016 Dividend
5 6.25 03 Jun 2015 Dividend
6 6.00 03 Jun 2014 Dividend
7 5.25 31 May 2013 Dividend
8 4.50 11 Jun 2012 Dividend
9 2.80 10 Jun 2011 Dividend
10 1.65 10 Jun 2011 Special Dividend
11 10.00 09 Jun 2010 Dividend
12 3.70 13 Jul 2009 Dividend
13 3.50 16 Jul 2008 Dividend
14 3.10 16 Jul 2007 Dividend
15 10.00 03 Jul 2001 Dividend

Malformed Lambda proxy response - Python

I know this is a duplicate of multiple questions, but for some reason I've not been able to figure out how to apply those solutions to my problem. The function works fine in Lambda tests, but fails when testing it via API Gateway.
import boto3
import json
import decimal
from boto3.dynamodb.conditions import Key, Attr
# Lambda entry point from the question: scans the DynamoDB table
# 'some_table' and returns the scanned items in a dict with the keys
# "status code" and "body".
# NOTE(review): this is the version that produces the "Malformed Lambda proxy
# response" shown in the log below; the reply to the question points at the
# key name ("statusCode") and at the body needing to be a string.
def lambda_handler(event, context):
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('some_table')
result = table.scan()
response = {
"status code": 200,
"body": result["Items"]
}
return response
Here's the error log
Execution log for request
Tue Dec 31 22:47:10 UTC 2019 : Starting execution for request:
Tue Dec 31 22:47:10 UTC 2019 : HTTP Method: GET, Resource Path: /notes
Tue Dec 31 22:47:10 UTC 2019 : Method request path: {}
Tue Dec 31 22:47:10 UTC 2019 : Method request query string: {}
Tue Dec 31 22:47:10 UTC 2019 : Method request headers: {}
Tue Dec 31 22:47:10 UTC 2019 : Method request body before transformations:
Tue Dec 31 22:47:10 UTC 2019 : Endpoint request URI:
Tue Dec 31 22:47:10 UTC 2019 : Endpoint request headers: {x-amzn-lambda-integration-tag=1c231f4e-97e9-405a-aadf-ce37b34ccccd, Authorization=*****************************************************************************************************************************************************************************************************************************************************************************************************************************25519d, X-Amz-Date=20191231T224710Z, x-amzn-apigateway-api-id=4tjnqn8083, X-Amz-Source-Arn=arn:aws:execute-api:561581028295:4tjnqn8083/test-invoke-stage/GET/notes, Accept=application/json, User-Agent=AmazonAPIGateway_4tjnqn8083, X-Amz-Security-Token=IQoJb3JpZ2luX2VjEI7//////////wEaDmFwLXNvdXRoZWFzdC0yIkcwRQIgWl5Cw0aOXcxA4tBC8730wNLqnDVeo98T4+nu23F0CH8CIQCfqC5gJ6U4/UaXtHMOc1riROnwTj7AbYIKs/PCGam00irHAwj3//////////8BEAIaDDc5ODM3NjExMzg1MyIM3wb8dOuNeahpJ6o1KpsDbq4XLSkUYzoiplWuxXWlXvC3sTNceGepB4Gzgwzq8Aw4KO4tcI0GXDBjaNDCTDUpI3HMfxboA6r4v2H84VJ6YiSyIfpqRrv/2DiBortTr4iTARMBIVQb+Nc1v [TRUNCATED]
Tue Dec 31 22:47:10 UTC 2019 : Endpoint request body after transformations: {"resource":"/notes","path":"/notes","httpMethod":"GET","headers":null,"multiValueHeaders":null,"queryStringParameters":null,"multiValueQueryStringParameters":null,"pathParameters":null,"stageVariables":null,"requestContext":{"resourceId":"wb2eow","resourcePath":"/notes","httpMethod":"GET","extendedRequestId":"Fl1tQG5sywMF1tg=","requestTime":"31/Dec/2019:22:47:10 +0000","path":"/notes","accountId":"34523452346","protocol":"HTTP/1.1","stage":"test-invoke-stage","domainPrefix":"testPrefix","requestTimeEpoch":1577832430388,"requestId":"1c231f4e-97e9-405a-aadf-ce37b34ccccd","identity":{"cognitoIdentityPoolId":null,"cognitoIdentityId":null,"apiKey":"test-invoke-api-key","principalOrgId":null,"cognitoAuthenticationType":null,"userArn":"arn:aws:iam::561581028295:user/sanjay","apiKeyId":"test-invoke-api-key-id","userAgent":"aws-internal/3 aws-sdk-java/1.11.690 Linux/4.9.184-0.1.ac.235.83.329.metal1.x86_64 OpenJDK_64-Bit_Server_VM/25.232-b09 java/1.8.0_232 vendor/Oracle_Co [TRUNCATED]
Tue Dec 31 22:47:10 UTC 2019 : Sending request to https://lambda.amazonaws.com/2015-03-31/functions/arn:aws:lambda:2:562534523452345:function:listMyNote/invocations
Tue Dec 31 22:47:11 UTC 2019 : Received response. Status: 200, Integration latency: 1393 ms
Tue Dec 31 22:47:11 UTC 2019 : Endpoint response headers: {Date=Tue, 31 Dec 2019 22:47:11 GMT, Content-Type=application/json, Content-Length=118, Connection=keep-alive, x-amzn-RequestId=ac66aba1-d4c3-45ec-add3-f436cf177da9, x-amzn-Remapped-Content-Length=0, X-Amz-Executed-Version=$LATEST, X-Amzn-Trace-Id=root=1-5e0bcfee-7c8dc2fff64742d811635106;sampled=0}
Tue Dec 31 22:47:11 UTC 2019 : Endpoint response body before transformations: {"status code": 200, "body": "[{'id': '00f5fe2a-2c17-11ea-b5d9-dda84499b43e', 'text': 'Hello from the other side!'}]"}
Tue Dec 31 22:47:11 UTC 2019 : Execution failed due to configuration error: Malformed Lambda proxy response
Tue Dec 31 22:47:11 UTC 2019 : Method completed with status: 502
I've read multiple posts talking about the response body needing to be a string and to follow some kind of predefined format, but I'm not sure what I'm missing. Any help would be greatly appreciated.

It's statusCode. And you should return a string for body.
Here you go:
import decimal
import json


def _to_jsonable(value):
    """json.dumps fallback: turn DynamoDB Decimal numbers into int or float.

    boto3 returns DynamoDB numbers as decimal.Decimal, which json.dumps
    cannot serialize on its own. Any other unsupported type still raises
    TypeError, exactly as json.dumps would without this hook.
    """
    if isinstance(value, decimal.Decimal):
        return int(value) if value == value.to_integral_value() else float(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


response = {
    # The key must be spelled "statusCode", and "body" must be a string.
    "statusCode": 200,
    "body": json.dumps(result["Items"], default=_to_jsonable),
}

Python gzip gives null bytes

I'm trying to parse some log files in Python, but my responses always return only null bytes.
I've confirmed that the file in question does contain data:
$ zcat Events.log.gz | wc -c
188371128
$ zcat Events.log.gz | head
17 Jan 2018 08:10:35,863: {"deviceType":"A16ZV8BU3SN1N3",[REDACTED]}
17 Jan 2018 08:10:35,878: {"deviceType":"A1CTGXB4BA274T",[REDACTED]}
17 Jan 2018 08:10:35,886: {"deviceType":"A1DL2DVDQVK3Q",[REDACTED]}
17 Jan 2018 08:10:35,911: {"deviceType":"A2CZFJ2RKY7SE2",[REDACTED]}
17 Jan 2018 08:10:35,937: {"deviceType":"A2JTEGS8GUPDOF",[REDACTED]}
17 Jan 2018 08:10:35,963: {"appOtaState":"ota",[REDACTED]}
17 Jan 2018 08:10:35,971: {"deviceType":"A1DL2DVDQVK3Q",[REDACTED]}
17 Jan 2018 08:10:36,006: {"deviceType":"A2JTEGS8GUPDOF",[REDACTED]}
17 Jan 2018 08:10:36,013: {"deviceType":"A1CTGXB4BA274T",[REDACTED]}
17 Jan 2018 08:10:36,041: {"deviceType":"A1DL2DVDQVK3Q",[REDACTED]}
But attempting to read it in Python gives only null bytes:
$ python
Python 2.6.9 (unknown, Sep 14 2016, 17:46:59)
[GCC 4.4.6 20110731 (Red Hat 4.4.6-3)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> filename = 'Events.log.gz'
>>> import gzip
>>> content = gzip.open(filename).read()
>>> len(content)
188371128
>>> for i in range(10):
... content[i*10000:(i*10000)+10]
...
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
I've tried explicitly setting 'mode' to either 'r' or 'rb', with no difference in result.
I've also tried subprocess.Popen(['zcat', filename], stdout=subprocess.PIPE).stdout.read(), with the same response.
Perhaps relevantly, when I tried to zcat the file to another file, the output was a binary file:
$ zcat Events.log.gz > /tmp/logoutput
$ less /tmp/logoutput
"/tmp/logoutput" may be a binary file. See it anyway?
[y]
^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#^#...
$ head /tmp/logoutput
17 Jan 2018 08:10:35,863: {"deviceType":"A16ZV8BU3SN1N3",[REDACTED]}
17 Jan 2018 08:10:35,878: {"deviceType":"A1CTGXB4BA274T",[REDACTED]}
17 Jan 2018 08:10:35,886: {"deviceType":"A1DL2DVDQVK3Q",[REDACTED]}
17 Jan 2018 08:10:35,911: {"deviceType":"A2CZFJ2RKY7SE2",[REDACTED]}
17 Jan 2018 08:10:35,937: {"deviceType":"A2JTEGS8GUPDOF",[REDACTED]}
17 Jan 2018 08:10:35,963: {"appOtaState":"ota",[REDACTED]}
17 Jan 2018 08:10:35,971: {"deviceType":"A1DL2DVDQVK3Q",[REDACTED]}
17 Jan 2018 08:10:36,006: {"deviceType":"A2JTEGS8GUPDOF",[REDACTED]}
17 Jan 2018 08:10:36,013: {"deviceType":"A1CTGXB4BA274T",[REDACTED]}
17 Jan 2018 08:10:36,041: {"deviceType":"A1DL2DVDQVK3Q",[REDACTED]}

Python - Parsing a text file into a csv file

I have a text file that is output from a command that I ran with Netmiko to retrieve data from a Cisco WLC of things that are causing interference on our WiFi network. I stripped out just what I needed from the original 600k lines of code down to a couple thousand lines like this:
AP Name.......................................... 010-HIGH-FL4-AP04
Microwave Oven 11 10 -59 Mon Dec 18 08:21:23 2017
WiMax Mobile 11 0 -84 Fri Dec 15 17:09:45 2017
WiMax Fixed 11 0 -68 Tue Dec 12 09:29:30 2017
AP Name.......................................... 010-2nd-AP04
Microwave Oven 11 10 -61 Sat Dec 16 11:20:36 2017
WiMax Fixed 11 0 -78 Mon Dec 11 12:33:10 2017
AP Name.......................................... 139-FL1-AP03
Microwave Oven 6 18 -51 Fri Dec 15 12:26:56 2017
AP Name.......................................... 010-HIGH-FL3-AP04
Microwave Oven 11 10 -55 Mon Dec 18 07:51:23 2017
WiMax Mobile 11 0 -83 Wed Dec 13 16:16:26 2017
The goal is to end up with a csv file that strips out the 'AP Name ...' prefix and puts what's left (the AP name) on the same line as the information from each of the lines that follow it. The problem is that some APs have two lines below the AP name and some have one or none. I have been at it for 8 hours and cannot find the best way to make this happen.
This is the latest version of code that I was trying to use, any suggestions for making this work? I just want something I can load up in excel and create a report with:
# Question's attempt: read 'wlc-interference_raw.txt' line by line, start a
# new `_temp` record whenever an 'AP Name' line is seen, and append the
# comma-separated fields of the other lines to it.
# NOTE(review): the original indentation is lost, so the nesting (for example
# whether the final `_temp += '\n'` belongs to the inner loop) cannot be
# confirmed from here.
# NOTE(review): `outfile_name` is not defined anywhere in this snippet.
with open(outfile_name, 'w') as out_file:
with open('wlc-interference_raw.txt', 'r')as in_file:
#Variables
_ap_name = ''
_temp = ''
_flag = False
for i in in_file:
if 'AP Name' in i:
#write whatever was put in the temp file to disk because new ap now
#add another temp variable in case an ap has more than 1 interferer and check if new AP name
out_file.write(_temp)
out_file.write('\n')
#print(_temp)
# NOTE(review): str.lstrip() treats its argument as a set of characters, not
# as a literal prefix, so leading characters of the AP name that are in that
# set would be stripped as well.
_ap_name = i.lstrip('AP Name.......................................... ')
_ap_name = _ap_name.rstrip('\n')
_temp = _ap_name
#print(_temp)
elif '----' in i:
pass
elif 'Class Type' in i:
pass
else:
# Any other line is split on whitespace and each field is appended to the
# current record, preceded by a comma.
line_split = i.split()
for x in line_split:
_temp += ','
_temp += x
_temp += '\n'

I think your best option is to read all lines of the file, then split into sections starting with AP Name. Then you can work on parsing each section.
Example
s = """AP Name.......................................... 010-HIGH-FL4-AP04
Microwave Oven 11 10 -59 Mon Dec 18 08:21:23 2017
WiMax Mobile 11 0 -84 Fri Dec 15 17:09:45 2017
WiMax Fixed 11 0 -68 Tue Dec 12 09:29:30 2017
AP Name.......................................... 010-2nd-AP04
Microwave Oven 11 10 -61 Sat Dec 16 11:20:36 2017
WiMax Fixed 11 0 -78 Mon Dec 11 12:33:10 2017
AP Name.......................................... 139-FL1-AP03
Microwave Oven 6 18 -51 Fri Dec 15 12:26:56 2017
AP Name.......................................... 010-HIGH-FL3-AP04
Microwave Oven 11 10 -55 Mon Dec 18 07:51:23 2017
WiMax Mobile 11 0 -83 Wed Dec 13 16:16:26 2017"""

import re


class AP:
    """
    A class holding each section of the parsed file
    """

    def __init__(self):
        self.header = ""   # the complete "AP Name....... <name>" line
        self.content = []  # the lines that followed the header


def ap_name_from_header(header):
    """Return the AP name from an "AP Name....... <name>" header line.

    Only the literal "AP Name" prefix, its dot leader and the surrounding
    whitespace are removed. str.lstrip("AP Name.") would treat its argument
    as a set of characters and could also eat the start of the name itself.
    """
    return re.sub(r'^AP Name\.*\s*', '', header).strip()


sections = []
section = None
for line in s.split('\n'):  # Or 'for line in file:'
    # Starting new section
    if line.startswith('AP Name'):
        # If previously had a section, add to list
        if section is not None:
            sections.append(section)
        section = AP()
        section.header = line
    elif section is not None:
        # Lines before the first header belong to no section and are skipped.
        section.content.append(line)

# Add last section outside of loop (there is none if no header was seen)
if section is not None:
    sections.append(section)

for section in sections:
    ap_name = ap_name_from_header(section.header)
    for line in section.content:
        # You can extract the date separately, if needed
        # Splitting on more than one space using a regex, so single spaces
        # inside a field (e.g. the date) are kept together
        fields = re.split(r'\s\s+', line)
        # Remove trailing comma from imperfect split
        print(ap_name + "," + ",".join(fields).rstrip(','))
Output
010-HIGH-FL4-AP04,Microwave Oven,11,10,-59,Mon Dec 18 08:21:23 2017
010-HIGH-FL4-AP04,WiMax Mobile,11,0,-84,Fri Dec 15 17:09:45 2017
010-HIGH-FL4-AP04,WiMax Fixed,11,0,-68,Tue Dec 12 09:29:30 2017
010-2nd-AP04,Microwave Oven,11,10,-61,Sat Dec 16 11:20:36 2017
010-2nd-AP04,WiMax Fixed,11,0,-78,Mon Dec 11 12:33:10 2017
139-FL1-AP03,Microwave Oven,6,18,-51,Fri Dec 15 12:26:56 2017
010-HIGH-FL3-AP04,Microwave Oven,11,10,-55,Mon Dec 18 07:51:23 2017
010-HIGH-FL3-AP04,WiMax Mobile,11,0,-83,Wed Dec 13 16:16:26 2017
Tip:
You don't need Python to write the CSV, you can output to a file using the command line
python script.py > output.csv

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrape website whose url doesn't change [python with beautiful soup] - python

Related

raise ValueError(err) - Implementation of multithreading using concurrent.future in Python

Parsing a table data from BSE site into Python

Malformed Lambda proxy response - Python

Python gzip gives null bytes

Python - Parsing a text file into a csv file

Categories

Resources