I'm looking for help; I have exhausted my limited ability. The code below gives me everything I need, except that there are two embedded fields, 'sectionaltimes' and 'splittimes', which I also need included in the dataframe, and therefore exported to Excel, as individual components rather than as one long string.
import requests
import json
import pandas as pd
import xlsxwriter
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0',
           'Accept-Language': 'en-US,en;q=0.5'}

df = pd.DataFrame()

for race in range(1, 9):
    url = f"https://s3-ap-southeast-2.amazonaws.com/racevic.static/2015-01-01/flemington/sectionaltimes/race-{race}.json?callback=sectionaltimes_callback"
    r = requests.get(url, headers=headers)
    # strip the JSONP wrapper "sectionaltimes_callback(...)" before parsing
    json_obj = json.loads(r.text.split('sectionaltimes_callback(')[1].rsplit(')', 1)[0])
    main_df = pd.DataFrame(json_obj['Horses'])
    df = pd.concat([df, main_df])

df.reset_index(drop=True, inplace=True)
df.to_excel("20150101.xlsx")
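One possible way to expand those embedded fields, sketched here on the assumption that each horse's 'SectionalTimes' and 'SplitTimes' value is a list of dicts with 'Distance' and 'Time' keys (the key names are an assumption; inspect one record to confirm), is to turn each list into a set of prefixed columns before concatenating:

def expand_times(cell):
    # cell is assumed to be a list of dicts such as
    # {'Distance': '200m', 'Time': 11.2}; anything else becomes empty
    if isinstance(cell, list):
        return pd.Series({d.get('Distance'): d.get('Time') for d in cell})
    return pd.Series(dtype=object)

# inside the race loop, after main_df is built:
for col in ('SectionalTimes', 'SplitTimes'):
    if col in main_df.columns:
        expanded = main_df[col].apply(expand_times).add_prefix(f'{col}_')
        main_df = pd.concat([main_df.drop(columns=col), expanded], axis=1)

Each distinct distance then becomes its own column (for example SectionalTimes_200m), so the values arrive in Excel as individual cells rather than one long string.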
I want to scrape this URL https://aviation-safety.net/wikibase/type/C206.
I don't understand the meaning of this error below:
'NoneType' object has no attribute 'prettify'
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request
url = 'https://aviation-safety.net/wikibase/type/C206'
req = Request(url, headers={
    'accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
data = []
while True:
    print(url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    data.append(pd.read_html(soup.select_one('tbody').prettify())[0])
    if soup.select_one('div.pagenumbers + div a[href]'):
        url = soup.select_one('div.pagenumbers + div a')['href']
    else:
        break

df = pd.concat(data)
df.to_csv('206.csv', encoding='utf-8-sig', index=False)
You're not using headers with requests, which is why you're not getting the right HTML, and the table you're after is the second one, not the first. Also, I'd highly recommend using requests over urllib.request.
So, having said that, here's how to get all the tables from all the pages:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = 'https://aviation-safety.net/wikibase/type/C206'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
}
data = []

with requests.Session() as s:
    total_pages = int(
        BeautifulSoup(s.get(url, headers=headers).text, "lxml")
        .select("div.pagenumbers > a")[-1]
        .getText()
    )
    for page in range(1, total_pages + 1):
        print(f"Getting page: {page}...")
        data.append(
            pd.read_html(
                s.get(f"{url}/{page}", headers=headers).text,
                flavor="lxml",
            )[1]
        )

df = pd.concat(data)
df.to_csv('206.csv', sep=";", index=False)
I want the data in a DataFrame. The code is working perfectly otherwise; please help me solve this issue and provide the data in a DataFrame. I tried to solve it myself but failed.
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
browser = webdriver.Chrome(r'F:\chromedriver.exe')
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"}

for title in browser.find_elements_by_css_selector('option'):
    title.click()
    time.sleep(1)
    browser.switch_to.frame(browser.find_elements_by_css_selector("iframe")[1])
    table = browser.find_element_by_css_selector("table table")
    soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
    all_data = []
    ratings = {"BUY": [], "HOLD": [], "SELL": []}
    lists_ = []
    for row in soup.select("tr")[-4:-1]:
        info_list = row.select("td")
        count = info_list[1].text
        percent = info_list[2].text
        IBServ_count = info_list[4].text
        IBServ_percent = info_list[5].text
        lists_.append([count, percent, IBServ_count, IBServ_percent])
    ratings["BUY"] = lists_[0]
    ratings["HOLD"] = lists_[1]
    ratings["SELL"] = lists_[2]
You can find the solution below:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
browser = webdriver.Chrome(r'F:\chromedriver.exe')
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"}
title_lists = []
buy_lists = []
hold_lists = []
sell_lists = []

for title in browser.find_elements_by_css_selector('option'):
    title.click()
    time.sleep(1)
    title_lists.append(title.text)
    browser.switch_to.frame(browser.find_elements_by_css_selector("iframe")[1])
    table = browser.find_element_by_css_selector("table table")
    soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
    lists_ = []
    for row in soup.select("tr")[-4:-1]:
        info_list = row.select("td")
        count = info_list[1].text
        percent = info_list[2].text
        IBServ_count = info_list[4].text
        IBServ_percent = info_list[5].text
        lists_.append([count, percent, IBServ_count, IBServ_percent])
    buy_lists.append(lists_[0])
    hold_lists.append(lists_[1])
    sell_lists.append(lists_[2])
    browser.switch_to.default_content()

header = pd.MultiIndex.from_product([['BUY', 'HOLD', 'SELL'],
                                     ['Count', 'Percent', 'IBServ_count', 'IBServ_percent']],
                                    names=['Action', 'Rating'])
m = np.array([[i[0] for i in buy_lists], [i[1] for i in buy_lists], [i[2] for i in buy_lists], [i[3] for i in buy_lists],
              [i[0] for i in hold_lists], [i[1] for i in hold_lists], [i[2] for i in hold_lists], [i[3] for i in hold_lists],
              [i[0] for i in sell_lists], [i[1] for i in sell_lists], [i[2] for i in sell_lists], [i[3] for i in sell_lists]])
dc = pd.DataFrame(np.rot90(m), columns=header)
dc["Title"] = title_lists
dc = dc.set_index("Title")
dc.to_csv('out.csv', index=0)
I used numpy to manipulate the array of information I created. It may not be the best use of pandas, but it creates the DataFrame you are looking for.
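As an aside, a simpler variant (a sketch reusing the same buy_lists, hold_lists, sell_lists, title_lists, and header built above) avoids the numpy rotation by assembling one twelve-value row per title:

# each row is the BUY, HOLD and SELL quadruples for one title,
# in the same order as the MultiIndex columns defined above
rows = [b + h + s for b, h, s in zip(buy_lists, hold_lists, sell_lists)]
dc = pd.DataFrame(rows, columns=header, index=pd.Index(title_lists, name="Title"))
dc.to_csv('out.csv')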
You can do this:
data = {
'Details': lists_
}
df = pd.DataFrame.from_dict(data)
df.to_csv('out.csv', index = 0)
You should write this outside the for loop that you have, so the DataFrame is built and written once, after all rows are collected; see the sketch below.
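A minimal placement sketch (the per-title scraping is stubbed out with dummy data here, so only the loop structure is the point):

import pandas as pd

all_rows = []
for lists_ in ([["1", "2%"]], [["3", "4%"]]):  # stand-in for each title's scrape
    all_rows.extend(lists_)  # accumulate inside the loop

# build and write the DataFrame once, after the loop
df = pd.DataFrame({'Details': all_rows})
df.to_csv('out.csv', index=False)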
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime as dt
import time
import os
from pathlib import Path
df= ['ACC', 'ADANIENT', 'ADANIPORTS', 'AMARAJABAT', 'AMBUJACEM', 'APOLLOHOSP', 'APOLLOTYRE', 'ASHOKLEY', 'ASIANPAINT', 'AUROPHARMA', 'AXISBANK', 'BAJAJ-AUTO', 'BAJAJFINSV', 'BAJFINANCE', 'BALKRISIND', 'BANDHANBNK', 'BANKBARODA',
'BATAINDIA', 'BEL', 'BERGEPAINT', 'BHARATFORG', 'BHARTIARTL', 'BHEL', 'BIOCON', 'BOSCHLTD', 'BPCL', 'BRITANNIA','CADILAHC', 'CANBK', 'CHOLAFIN', 'CIPLA', 'COALINDIA', 'COFORGE', 'COLPAL', 'CONCOR', 'CUMMINSIND', 'DABUR', 'DIVISLAB', 'DLF', 'DRREDDY', 'EICHERMOT', 'ESCORTS', 'EXIDEIND', 'FEDERALBNK', 'GAIL', 'GLENMARK', 'GMRINFRA', 'GODREJCP', 'GODREJPROP', 'GRASIM', 'HAVELLS', 'HCLTECH', 'HDFC', 'HDFCBANK', 'HDFCLIFE', 'HEROMOTOCO', 'HINDALCO', 'HINDPETRO', 'HINDUNILVR', 'IBULHSGFIN', 'ICICIBANK', 'ICICIPRULI', 'IDEA', 'IDFCFIRSTB', 'IGL',
'INDIGO','INDUSINDBK','INFRATEL','INFY','IOC','ITC', 'JINDALSTEL', 'JSWSTEEL', 'JUBLFOOD', 'KOTAKBANK', 'L&TFH', 'LICHSGFIN', 'LT', 'LUPIN', 'M&M', 'M&MFIN', 'MANAPPURAM', 'MARICO','MARUTI','MCDOWELL-N','MFSL','MGL', 'MINDTREE','MOTHERSUMI', 'MRF', 'MUTHOOTFIN', 'NATIONALUM', 'NAUKRI','NESTLEIND','NMDC', 'NTPC','ONGC', 'PAGEIND','PEL', 'PETRONET',
'PFC', 'PIDILITIND','PNB', 'POWERGRID','PVR', 'RAMCOCEM','RBLBANK', 'RECLTD','RELIANCE', 'SAIL','SBILIFE', 'SBIN','SHREECEM','SIEMENS','SRF','SRTRANSFIN','SUNPHARMA','SUNTV','TATACHEM', 'TATACONSUM','TATAMOTORS', 'TATAPOWER','TATASTEEL', 'TCS','TECHM', 'TITAN','TORNTPHARM', 'TORNTPOWER', 'TVSMOTOR', 'UBL','ULTRACEMCO','UPL', 'VEDL', 'VOLTAS','WIPRO', 'ZEEL']
for k in df:
    print(k)
    k1 = k
    k = k.replace('&', '%26')  # URL-encode the ampersand in symbols like M&M
    url = "https://www.nseindia.com/api/option-chain-equities?symbol=" + str(k)
    # headers = {"user-agent": "Mozilla/5.0"}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) chrome/80.0.3987.132 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate'}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    data1 = response.json()
    if not data1:
        print("empty # " + k)
    else:
        data = data1["records"]["data"]
        columns = pd.DataFrame(data[0]).transpose().columns
        final_data = pd.DataFrame(columns=columns)
        for i in range(0, len(data)):
            z = pd.DataFrame(data[i])
            zz = z.transpose()
            if len(zz.index) == 3:
                final_data = final_data.append(zz.loc[[zz.index[2]]])
            elif len(zz.index) == 4:
                final_data = final_data.append(zz.loc[[zz.index[2]]])
                final_data = final_data.append(zz.loc[[zz.index[3]]])
        final_data = final_data.reset_index()
        final_data = final_data.rename(columns={'index': 'option_type'})
        final_data = final_data[(final_data['openInterest'] > 0) & (final_data['changeinOpenInterest'] > 0)].reset_index(drop=True)
        path = Path(r'E:\code and finance\Data\FNO\Equity\options', k1, (dt.datetime.today()).strftime("%d-%m-%Y") + ".xlsx")
        final_data.to_excel(path)
I was using this code to get the data, but recently I started getting 'HTTPError: 401 Client Error' when accessing the website from Python, although not from the Chrome browser. What should I do to access the website from Python like I used to? The link works in Chrome but not in Python.
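One common cause with the nseindia.com API, offered here as an assumption rather than a confirmed diagnosis, is that the endpoint rejects requests that do not carry the cookies the site sets on a normal page visit. A minimal sketch of the workaround, warming up a session on the homepage before calling the API:

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
           'Accept-Language': 'en-US,en;q=0.9',
           'Accept-Encoding': 'gzip, deflate'}

with requests.Session() as s:
    # visit the homepage first so the session picks up the site's cookies
    s.get("https://www.nseindia.com", headers=headers, timeout=10)
    # reuse the same session (and cookies) for the API call
    r = s.get("https://www.nseindia.com/api/option-chain-equities?symbol=ACC",
              headers=headers, timeout=10)
    r.raise_for_status()
    data1 = r.json()

If the 401 persists, comparing the request headers the browser sends (visible in its developer tools) against the ones used in Python is the next step.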
I have a link as below:
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=BANKNIFTY&date=9JAN2020"
I want to collect all the available expiry dates, as per the image below:
My Code:
import pandas as pd
from requests import Session
import os, time, sys
from datetime import datetime
s = Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/75.0.3770.80 Safari/537.36'}
# Add headers
s.headers.update(headers)
URL = 'https://www.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp'
params = {'symbolCode': 9999, 'symbol': 'BANKNIFTY', 'instrument': '-', 'date': '9JAN2020', 'segmentLink': 17}
res = s.get(URL, params=params)
df1 = pd.read_html(res.content)[0]
df2 = pd.read_html(res.content)[1]
I am not able to get the values in either df1 or df2.
It needs only minimal knowledge of requests and BeautifulSoup or lxml.
import requests
import lxml.html
url = 'https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=BANKNIFTY&date=9JAN2020'
r = requests.get(url)
soup = lxml.html.fromstring(r.text)
items = soup.xpath('//form[@id="ocForm"]//option/text()')
print(items)
Result
[' Select ', '9JAN2020', '16JAN2020', '23JAN2020', '30JAN2020', '6FEB2020', '13FEB2020', '20FEB2020', '27FEB2020', '5MAR2020', '26MAR2020']
import pandas as pd
from requests import Session
import lxml.html
s = Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/75.0.3770.80 Safari/537.36'}
# Add headers
s.headers.update(headers)
URL = 'https://www.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp'
params = {'symbolCode':9999,'symbol':'BANKNIFTY','instrument': 'OPTIDX','date': '-','segmentLink': 17}
res = s.get(URL, params=params)
soup = lxml.html.fromstring(res.text)
items = soup.xpath('//form[@id="ocForm"]//option/text()')
print(items)
text = pd.read_html(res.content)[0].loc[0, 1]
print(text)
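If only the expiry dates themselves are needed, the ' Select ' placeholder can be filtered out (a small addition, assuming the placeholder is the only non-date entry in the list):

# drop blanks and the ' Select ' placeholder, keeping just the dates
expiry_dates = [i.strip() for i in items if i.strip() and i.strip() != 'Select']
print(expiry_dates)  # e.g. ['9JAN2020', '16JAN2020', '23JAN2020', ...]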