Looking for help; I have exhausted my limited ability. The code below gives me everything I need, except that there are two embedded fields, 'sectionaltimes' and 'splittimes', which I also need included in the dataframe (and therefore exported to Excel) as individual components rather than as one long string.
import requests
import json
import pandas as pd
import xlsxwriter
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0',
           'Accept-Language': 'en-US,en;q=0.5'}
df=pd.DataFrame()
for race in range(1, 9):
    url = f"https://s3-ap-southeast-2.amazonaws.com/racevic.static/2015-01-01/flemington/sectionaltimes/race-{race}.json?callback=sectionaltimes_callback"
    r = requests.get(url, headers=headers)
    json_obj = json.loads(r.text.split('sectionaltimes_callback(')[1].rsplit(')', 1)[0])
    main_df = pd.DataFrame(json_obj['Horses'])
    df = pd.concat([df, main_df])
df.reset_index(drop=True, inplace=True)
df.to_excel("20150101.xlsx")
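One possible approach (a sketch only, since the exact JSON structure isn't shown here): if each embedded field holds a list of dicts, pandas can explode the list into one row per element and flatten each dict into prefixed columns. The column names 'SectionalTimes' and 'SplitTimes' below are assumptions; adjust them to the actual keys in the payload.
def expand_column(frame, col):
    # One row per list element, then one column per dict key,
    # prefixed with the original column name.
    exploded = frame.explode(col).reset_index(drop=True)
    flat = pd.json_normalize(exploded[col]).add_prefix(f"{col}_")
    return pd.concat([exploded.drop(columns=[col]), flat], axis=1)

for col in ("SectionalTimes", "SplitTimes"):
    if col in df.columns:
        df = expand_column(df, col)
df.to_excel("20150101_expanded.xlsx", index=False)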
from datetime import timedelta, date
from nsepy import get_history
import pandas as pd
def importdata(stock):
    stock_fut = get_history(symbol=stock,
                            start=date.today() - timedelta(days=3), end=date.today(),
                            futures=True,
                            expiry_date=date(2022, 9, 29))
    # print(stock_fut.columns)
    print(stock_fut[["Symbol", "Number of Contracts", "Change in OI", "Open Interest"]])
a = ["ULTRACEMCO"]
#a = ["CONCOR", "JKCEMENT","SHREECEM","RAMCOCEM","INDIGO","ACC","BAJAJ-AUTO","ULTRACEMCO","PERSISTENT","MARUTI"]
for i in range(0, len(a)):
    # print(a[i])
    # importdata(a[i])
    df = pd.DataFrame(a[i])
    print(df)
I am unable to do the same; I get an error saying the DataFrame constructor is not called properly.
I also want to join the data for all symbols into a single table.
import requests
import json
import codecs
import pandas as pd
baseurl = "https://www.nseindia.com/"
url = 'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
           'accept-language': 'en,gu;q=0.9,hi;q=0.8',
           'accept-encoding': 'gzip, deflate, br'}
session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
df = pd.DataFrame(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
mini_df = df[['symbol']]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#print(df)
#print(mini_df)
print(mini_df.to_string(index=False))
What if I want this symbol output to be the input for the value of "a" in the code below?
I tried to fix your code with minimal modifications; hope it helps:
from datetime import timedelta, date
from nsepy import get_history
import pandas as pd
import requests
import json
import codecs
baseurl = "https://www.nseindia.com/"
url = 'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
           'accept-language': 'en,gu;q=0.9,hi;q=0.8',
           'accept-encoding': 'gzip, deflate, br'}
session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
# here we use the from_dict method of pandas.DataFrame to build a dataframe from a collection of dictionaries inside a dictionary.
df1 = pd.DataFrame.from_dict(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
# and here we get a list of unique symbols
a = df1.symbol.unique().tolist()
def importdata(stock):
    stock_fut = get_history(symbol=stock,
                            start=date.today() - timedelta(days=3), end=date.today(),
                            futures=True,
                            expiry_date=date(2022, 9, 29))
    return stock_fut[["Symbol", "Number of Contracts", "Change in OI", "Open Interest"]]
# here we add all the dataframes to a list
df_list = []
for i in a:
    temp = importdata(i)
    temp_df = pd.DataFrame(temp)
    df_list.append(temp_df)
# and here we concatenate all of them together in a row-wise manner.
df = pd.concat(df_list)
print(df)
Change the line to:
df = pd.DataFrame([a[i]])
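This works because pd.DataFrame expects array-like, dict, or DataFrame input; a bare string raises the "DataFrame constructor not properly called!" error from the question. A quick check:
import pandas as pd

# A one-element list becomes a one-row, one-column frame.
df = pd.DataFrame(["ULTRACEMCO"])
print(df)
#             0
# 0  ULTRACEMCO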
I am trying to get my columns to auto-adjust to the data, but I get an error. Is there a way to make the data fit in the Excel file? The error is: ValueError: Shape of passed values is (1, 12), indices imply (1, 1). Is there any way to solve this error?
import enum
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
r = requests.get("https://www.fleetpride.com/parts/otr-coiled-air-hose-otr6818")
soup = BeautifulSoup(r.content, "html5lib")
raw_json = ""
for table_index, table in enumerate(soup.find_all("script")):
    if 'CCRZ.detailData.jsonProductData = {"' in str(table):
        x = str(table).split('CCRZ.detailData.jsonProductData = {"')
        raw_json = '{"' + str(x[-1]).split('};')[0] + '}'
        break
req_json = json.loads(raw_json)
# with open("text_json.json","w")as file:
# x=json.dump(req_json,file,indent=4)
temp = req_json
cat=temp['product']['prodBean']['friendlyUrl'][:11]
catu=temp['product']['prodBean']['friendlyUrl'][11:55]
catk=temp['product']['prodBean']['friendlyUrl'][56:71]
cup=temp['product']['prodBean']['friendlyUrl'][72:]
title=temp['product']['prodBean']['name']
specification=temp['product']['prodBean']['sku']
spec1=temp['product']['prodBean']['productSpecsS'][15]['specValue']
spec2=temp['product']['prodBean']['productSpecsS'][30]['specValue']
spec3=temp['product']['prodBean']['productSpecsS'][28]['specValue']
spec4=temp['product']['prodBean']['productSpecsS'][29]['specValue']
spec5=temp['product']['prodBean']['productSpecsS'][27]['specValue']
spec6=temp['product']['prodBean']['productSpecsS'][18]['specValue']
spec7=temp['product']['prodBean']['productSpecsS'][19]['specValue']
spec8=temp['product']['prodBean']['productSpecsS'][20]['specValue']
fea=spec6+spec7+spec8
spec11=temp['product']['prodBean']['ECrossReferencesS'][0]['Interchange_Part_Number__c']
spec12=temp['product']['prodBean']['ECrossReferencesS'][1]['Interchange_Part_Number__c']
spec13=temp['product']['prodBean']['ECrossReferencesS'][2]['Interchange_Part_Number__c']
spec14=temp['product']['prodBean']['ECrossReferencesS'][3]['Interchange_Part_Number__c']
spec15=temp['product']['prodBean']['ECrossReferencesS'][4]['Interchange_Part_Number__c']
spec16=temp['product']['prodBean']['ECrossReferencesS'][5]['Interchange_Part_Number__c']
cross=spec11+spec12+spec13+spec14+spec15+spec16
wev=[]
web = {
    'category': cat,
    'sub_category': catu,
    'sub_category1': catk,
    'sub_category2': cup,
    'name': title,
    'Model_No': specification,
    'VMRS': spec1,
    'width_each': spec2,
    'Quantity': spec3,
    'Height_each': spec4,
    'cross_reference': cross,
    'feature': fea
}
# print(web)
wev.append(web)
df = pd.DataFrame(np.random.randint(0,100,size=(1, 12)),columns=wev)
# print(df)
df.to_csv('second.csv', index=False, encoding='utf-8')
Well, the following code should work:
writer = pd.ExcelWriter('test_file.xlsx')
df.to_excel(writer, sheet_name='test_sheet', index=False, na_rep='NaN')
for column in df:
    column_width = max(df[column].astype(str).map(len).max(), len(column))
    col_idx = df.columns.get_loc(column)
    writer.sheets['test_sheet'].set_column(col_idx, col_idx, column_width)
writer.save()
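As for the ValueError itself, it comes from the DataFrame construction rather than from Excel: a (1, 12) array of random numbers is passed as data while the one-element list wev is passed as columns=, so the shapes disagree. Building the frame directly from the scraped records avoids it:
df = pd.DataFrame(wev)  # one row per scraped record, one column per dict key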
I have been working on the code below and getting myself tied up in knots. What I am trying to do is build a simple dataframe using text scraped using BeautifulSoup.
I have scraped the applicable text from the <h5> and <p> tags but using find_all means that when I build the dataframe and write to csv the tags are included. To deal with this I have added the print(p.text, end=" ") statements but now nothing is being written to the csv.
Can anyone see what I am doing wrong?
import pandas as pd
import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
course = []
runner = []
page = requests.get('https://www.attheraces.com/tips/atr-tipsters/hugh-taylor', headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
tips = soup.find('div', class_='sticky')
for h5 in tips.find_all("h5"):
    course_name = print(h5.text, end=" ")
    course.append(course_name)
for p in tips.find_all("p"):
    runner_name = print(p.text, end=" ")
    runner.append(runner_name)
todays_tips = pd.DataFrame(
    {'Course': course,
     'Selection': runner,
     })
print(todays_tips)
todays_tips.to_csv(r'C:\Users\*****\Today.csv')
Don't assign the result of print (it always returns None, which is why nothing reaches the csv); use the element text directly, ideally in a list comprehension. Applying this should get you the dataframe you want.
For example:
import pandas as pd
import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
page = requests.get('https://www.attheraces.com/tips/atr-tipsters/hugh-taylor', headers=headers)
tips = BeautifulSoup(page.content, 'html.parser').find('div', class_='sticky')
course = [h5.getText() for h5 in tips.find_all("h5")]
runner = [p.getText() for p in tips.find_all("p")]
todays_tips = pd.DataFrame({'Course': course, 'Selection': runner})
print(todays_tips)
todays_tips.to_csv("your_data.csv", index=False)
Output:
Course Selection
0 1.00 HAYDOCK 1pt win RAINBOW JET (12-1 & 11-1 general)
1 2.50 GOODWOOD 1pt win MARSABIT (11-2 general)
And a matching .csv file is written as well.
I am accessing JSON data and want to convert it into a pandas dataframe.
Unfortunately, an error occurs at json.loads(req.text):
ValueError: No JSON object could be decoded
Below is my code.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
"Origin": "https://www.idx.co.id"}
req = requests.get("https://www.idx.co.id/Portals/0/StaticData/HomeHtml/data.js",
headers=HEADERS)
stocks = json.loads(req.text)
columns = ['code', 'name']
df = pd.DataFrame([{k: v for k,v in d.items() if k in columns}
for d in stocks, columns = columns)
You are not actually receiving JSON, but a JavaScript file. Applying a simple regular expression that captures the data between the square brackets, you can achieve the desired result.
import requests
import json
import re
req = requests.get("https://www.idx.co.id/Portals/0/StaticData/HomeHtml/data.js")
content = re.findall(r"= (\[.*?\]);", req.text)
data = json.loads(content[0])
print(data)
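From there the dataframe the question asked for can be built directly, assuming (as the question's own comprehension implies) that the decoded list holds dicts with 'code' and 'name' keys:
import pandas as pd

df = pd.DataFrame(data, columns=['code', 'name'])  # keeps only the two columns
print(df.head())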
Edit: a useful website for testing Python regexes is https://pythex.org/
I am scraping Trustpilot reviews, but data keeps getting overwritten with each iteration. How can I make it append all data from all pages instead of just the last one?
import re
import requests
import pandas as pd
from openpyxl import load_workbook
from bs4 import BeautifulSoup
def get_total_items(url):
    soup = BeautifulSoup(requests.get(url, format(0), headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"}).text, 'lxml')
    stars = []
    star1 = soup.find_all(attrs={"star-rating star-rating--medium"})
    stars.append(star1)
    df = pd.DataFrame(stars, ["Rating"])
    return df
ddf = []
for i in range(29):
    urls = "https://www.trustpilot.com/review/www.pandora.net?page={}"
    get_total_items(urls).append(ddf)
print(ddf)
Change the for loop like below:
for i in range(29):
    urls = "https://www.trustpilot.com/review/www.pandora.net?page={}"
    ddf.append(get_total_items(urls.format(i)))
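With that change, ddf ends up as a list with one dataframe per page; to finish with a single table, concatenate them after the loop:
df = pd.concat(ddf, ignore_index=True)
print(df)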