Data into pd.DataFrame - python

from datetime import timedelta, date
from nsepy import get_history
import pandas as pd
def importdata(stock):
    stock_fut = get_history(symbol=stock,
                            start=date.today() - timedelta(days=3), end=date.today(),
                            futures=True,
                            expiry_date=date(2022, 9, 29))
    #print(stock_fut.columns)
    print(stock_fut[["Symbol", "Number of Contracts", "Change in OI", "Open Interest"]])

a = ["ULTRACEMCO"]
#a = ["CONCOR", "JKCEMENT", "SHREECEM", "RAMCOCEM", "INDIGO", "ACC", "BAJAJ-AUTO", "ULTRACEMCO", "PERSISTENT", "MARUTI"]
for i in range(0, len(a)):
    #print(a[i])
    #importdata(a[i])
    df = pd.DataFrame(a[i])
    print(df)
This fails with an error saying the DataFrame constructor is not called properly.
I also want to join the data for all symbols into a single table.
import requests
import json
import codecs
import pandas as pd
baseurl = "https://www.nseindia.com/"
url = f'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, ''like Gecko) ''Chrome/80.0.3987.149 Safari/537.36','accept-language': 'en,gu;q=0.9,hi;q=0.8', 'accept-encoding': 'gzip, deflate, br'}
session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
df = pd.DataFrame(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
mini_df = df[['symbol']]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#print(df)
#print(mini_df)
print(mini_df.to_string(index=False))
Sir, what if I want to use this symbol output as the input for the value of "a" in the code below?

I tried to fix your code with minimal modification. Hope it helps:
from datetime import timedelta, date
from nsepy import get_history
import pandas as pd
import requests
import json
import codecs
baseurl = "https://www.nseindia.com/"
url = f'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, ''like Gecko) ''Chrome/80.0.3987.149 Safari/537.36','accept-language': 'en,gu;q=0.9,hi;q=0.8', 'accept-encoding': 'gzip, deflate, br'}
session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
# here we use the from_dict method of pandas.DataFrame to build a dataframe from a collection of dictionaries inside a dictionary
df1 = pd.DataFrame.from_dict(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
# and here we get a list unique of symbols
a = df1.symbol.unique().tolist()
def importdata(stock):
    stock_fut = get_history(symbol=stock,
                            start=date.today() - timedelta(days=3), end=date.today(),
                            futures=True,
                            expiry_date=date(2022, 9, 29))
    return stock_fut[["Symbol", "Number of Contracts", "Change in OI", "Open Interest"]]

# here we add all the dataframes to a list
df_list = []
for i in a:
    temp = importdata(i)
    temp_df = pd.DataFrame(temp)
    df_list.append(temp_df)

# and here we concatenate all of them together in a row-wise manner
df = pd.concat(df_list)
print(df)

Change the line to:
df = pd.DataFrame([a[i]])
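pd.DataFrame expects list-like, dict-like, or DataFrame input, so passing a bare string such as a[i] raises "DataFrame constructor not properly called!"; wrapping the string in a list gives the constructor a one-row column to build. A minimal illustration:
import pandas as pd

# pd.DataFrame("ULTRACEMCO")        # raises ValueError: DataFrame constructor not properly called!
df = pd.DataFrame(["ULTRACEMCO"])   # one row, default column name 0
print(df)
#             0
# 0  ULTRACEMCO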

Related

Webscraping to excel

Looking for help; I have exhausted my limited ability. The code below gives me everything I need, except that there are two embedded fields, 'sectionaltimes' and 'splittimes', which I also need included in the dataframe, and therefore exported to Excel, as individual components rather than as one long string.
import requests
import json
import pandas as pd
import xlsxwriter
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0',
'Accept-Language' : 'en-US,en;q=0.5'}
df=pd.DataFrame()
for race in range(1, 9):
    url = f"https://s3-ap-southeast-2.amazonaws.com/racevic.static/2015-01-01/flemington/sectionaltimes/race-{race}.json?callback=sectionaltimes_callback"
    r = requests.get(url, headers=headers)
    json_obj = json.loads(r.text.split('sectionaltimes_callback(')[1].rsplit(')', 1)[0])
    main_df = pd.DataFrame(json_obj['Horses'])
    df = pd.concat([df, main_df])

df.reset_index(drop=True, inplace=True)
df.to_excel("20150101.xlsx")
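One possible way to expand those embedded fields, assuming each entry in 'sectionaltimes' and 'splittimes' is itself a nested dict in the JSON (not verified against the actual feed), is pandas.json_normalize, which flattens nested dictionaries into separate dotted columns instead of leaving them as one long string. A sketch along those lines, with a hypothetical output file name:
import requests
import json
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0',
           'Accept-Language': 'en-US,en;q=0.5'}

frames = []
for race in range(1, 9):
    url = f"https://s3-ap-southeast-2.amazonaws.com/racevic.static/2015-01-01/flemington/sectionaltimes/race-{race}.json?callback=sectionaltimes_callback"
    r = requests.get(url, headers=headers)
    json_obj = json.loads(r.text.split('sectionaltimes_callback(')[1].rsplit(')', 1)[0])
    # json_normalize flattens any nested dicts (assumed here for the two embedded
    # fields) into their own columns; plain fields come through unchanged.
    frames.append(pd.json_normalize(json_obj['Horses']))

df = pd.concat(frames, ignore_index=True)
df.to_excel("20150101_flat.xlsx")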

What is the fix for this Error: 'NoneType' object has no attribute 'prettify'

I want to scrape this URL https://aviation-safety.net/wikibase/type/C206.
I don't understand the meaning of this error below:
'NoneType' object has no attribute 'prettify'
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request
url = 'https://aviation-safety.net/wikibase/type/C206'
req = Request(url, headers={
    'accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
data = []
while True:
    print(url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    data.append(pd.read_html(soup.select_one('tbody').prettify())[0])
    if soup.select_one('div.pagenumbers + div a[href]'):
        url = soup.select_one('div.pagenumbers + div a')['href']
    else:
        break
df = pd.concat(data)
df.to_csv('206.csv', encoding='utf-8-sig', index=False)
You're not using headers with requests, which is the reason you're not getting the right HTML, and the table you're after is the second one, not the first. Also, I'd highly recommend using requests over urllib.request.
So, having said that, here's how to get all the tables from all the pages:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = 'https://aviation-safety.net/wikibase/type/C206'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
}
data = []
with requests.Session() as s:
    total_pages = int(
        BeautifulSoup(s.get(url, headers=headers).text, "lxml")
        .select("div.pagenumbers > a")[-1]
        .getText()
    )
    for page in range(1, total_pages + 1):
        print(f"Getting page: {page}...")
        data.append(
            pd.read_html(
                s.get(f"{url}/{page}", headers=headers).text,
                flavor="lxml",
            )[1]
        )
df = pd.concat(data)
df.to_csv('206.csv', sep=";", index=False)

I want the data in a DataFrame; the code is working perfectly

I want the data in a DataFrame. The code below is working perfectly; please help me solve this issue and get the data into a DataFrame. I tried to solve it myself but failed.
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
browser = webdriver.Chrome('F:\chromedriver.exe')
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"}
for title in browser.find_elements_by_css_selector('option'):
    title.click()
    time.sleep(1)
    browser.switch_to.frame(browser.find_elements_by_css_selector("iframe")[1])
    table = browser.find_element_by_css_selector("table table")
    soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
    all_data = []
    ratings = {"BUY": [], "HOLD": [], "SELL": []}
    lists_ = []
    for row in soup.select("tr")[-4:-1]:
        info_list = row.select("td")
        count = info_list[1].text
        percent = info_list[2].text
        IBServ_count = info_list[4].text
        IBServ_percent = info_list[5].text
        lists_.append([count, percent, IBServ_count, IBServ_percent])
    ratings["BUY"] = lists_[0]
    ratings["HOLD"] = lists_[1]
    ratings["SELL"] = lists_[2]
You can find the solution below:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np

browser = webdriver.Chrome('F:\chromedriver.exe')
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"}
title_lists = []
buy_lists = []
hold_lists = []
sell_lists = []
for title in browser.find_elements_by_css_selector('option'):
    title.click()
    time.sleep(1)
    title_lists.append(title.text)
    browser.switch_to.frame(browser.find_elements_by_css_selector("iframe")[1])
    table = browser.find_element_by_css_selector("table table")
    soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
    lists_ = []
    for row in soup.select("tr")[-4:-1]:
        info_list = row.select("td")
        count = info_list[1].text
        percent = info_list[2].text
        IBServ_count = info_list[4].text
        IBServ_percent = info_list[5].text
        lists_.append([count, percent, IBServ_count, IBServ_percent])
    buy_lists.append(lists_[0])
    hold_lists.append(lists_[1])
    sell_lists.append(lists_[2])
    browser.switch_to.default_content()

header = pd.MultiIndex.from_product([['BUY', 'HOLD', 'SELL'],
                                     ['Count', 'Percent', 'IBServ_count', 'IBServ_percent']],
                                    names=['Action', 'Rating'])
m = np.array([[i[0] for i in buy_lists], [i[1] for i in buy_lists], [i[2] for i in buy_lists], [i[3] for i in buy_lists],
              [i[0] for i in hold_lists], [i[1] for i in hold_lists], [i[2] for i in hold_lists], [i[3] for i in hold_lists],
              [i[0] for i in sell_lists], [i[1] for i in sell_lists], [i[2] for i in sell_lists], [i[3] for i in sell_lists]])
dc = pd.DataFrame(np.rot90(m), columns=header)
dc["Title"] = title_lists
dc = dc.set_index("Title")
dc.to_csv('out.csv', index=0)
I used numpy to manipulate the array of information I created. It may not be the best usage of pandas but it creates the DataFrame you are looking for.
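For comparison, a more direct pandas construction of the same MultiIndex frame is sketched below, using hypothetical sample values in place of the title_lists/buy_lists/hold_lists/sell_lists built by the loop above; each row is just the BUY, HOLD and SELL four-value lists concatenated per title.
import pandas as pd

# Hypothetical stand-ins for the lists collected in the scraping loop above.
title_lists = ["Company A", "Company B"]
buy_lists = [["10", "50%", "4", "40%"], ["12", "55%", "5", "42%"]]
hold_lists = [["6", "30%", "2", "20%"], ["5", "23%", "2", "17%"]]
sell_lists = [["4", "20%", "1", "10%"], ["5", "22%", "1", "8%"]]

header = pd.MultiIndex.from_product([['BUY', 'HOLD', 'SELL'],
                                     ['Count', 'Percent', 'IBServ_count', 'IBServ_percent']],
                                    names=['Action', 'Rating'])
# One 12-value row per title: BUY values, then HOLD, then SELL.
rows = [b + h + s for b, h, s in zip(buy_lists, hold_lists, sell_lists)]
dc = pd.DataFrame(rows, columns=header, index=pd.Index(title_lists, name="Title"))
dc.to_csv('out.csv')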
You can do this:
data = {
    'Details': lists_
}
df = pd.DataFrame.from_dict(data)
df.to_csv('out.csv', index=0)
You should write this outside the for loop that you have.

HTTPError: 401 Client Error when accessing a website from Python but not from the Chrome browser

import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import time
import os
import numpy as np
from pathlib import Path
df= ['ACC', 'ADANIENT', 'ADANIPORTS', 'AMARAJABAT', 'AMBUJACEM', 'APOLLOHOSP', 'APOLLOTYRE', 'ASHOKLEY', 'ASIANPAINT', 'AUROPHARMA', 'AXISBANK', 'BAJAJ-AUTO', 'BAJAJFINSV', 'BAJFINANCE', 'BALKRISIND', 'BANDHANBNK', 'BANKBARODA',
'BATAINDIA', 'BEL', 'BERGEPAINT', 'BHARATFORG', 'BHARTIARTL', 'BHEL', 'BIOCON', 'BOSCHLTD', 'BPCL', 'BRITANNIA','CADILAHC', 'CANBK', 'CHOLAFIN', 'CIPLA', 'COALINDIA', 'COFORGE', 'COLPAL', 'CONCOR', 'CUMMINSIND', 'DABUR', 'DIVISLAB', 'DLF', 'DRREDDY', 'EICHERMOT', 'ESCORTS', 'EXIDEIND', 'FEDERALBNK', 'GAIL', 'GLENMARK', 'GMRINFRA', 'GODREJCP', 'GODREJPROP', 'GRASIM', 'HAVELLS', 'HCLTECH', 'HDFC', 'HDFCBANK', 'HDFCLIFE', 'HEROMOTOCO', 'HINDALCO', 'HINDPETRO', 'HINDUNILVR', 'IBULHSGFIN', 'ICICIBANK', 'ICICIPRULI', 'IDEA', 'IDFCFIRSTB', 'IGL',
'INDIGO','INDUSINDBK','INFRATEL','INFY','IOC','ITC', 'JINDALSTEL', 'JSWSTEEL', 'JUBLFOOD', 'KOTAKBANK', 'L&TFH', 'LICHSGFIN', 'LT', 'LUPIN', 'M&M', 'M&MFIN', 'MANAPPURAM', 'MARICO','MARUTI','MCDOWELL-N','MFSL','MGL', 'MINDTREE','MOTHERSUMI', 'MRF', 'MUTHOOTFIN', 'NATIONALUM', 'NAUKRI','NESTLEIND','NMDC', 'NTPC','ONGC', 'PAGEIND','PEL', 'PETRONET',
'PFC', 'PIDILITIND','PNB', 'POWERGRID','PVR', 'RAMCOCEM','RBLBANK', 'RECLTD','RELIANCE', 'SAIL','SBILIFE', 'SBIN','SHREECEM','SIEMENS','SRF','SRTRANSFIN','SUNPHARMA','SUNTV','TATACHEM', 'TATACONSUM','TATAMOTORS', 'TATAPOWER','TATASTEEL', 'TCS','TECHM', 'TITAN','TORNTPHARM', 'TORNTPOWER', 'TVSMOTOR', 'UBL','ULTRACEMCO','UPL', 'VEDL', 'VOLTAS','WIPRO', 'ZEEL']
for k in df:
    print(k)
    k1 = k
    k = k.replace('&', '%26')
    url = "https://www.nseindia.com/api/option-chain-equities?symbol=" + str(k)
    # headers = {"user-agent": "Mozilla/5.0"}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) chrome/80.0.3987.132 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate'}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    data1 = response.json()
    if not data1:
        print("empty # " + k)
    else:
        data = data1["records"]["data"]
        columns = pd.DataFrame(data[0]).transpose().columns
        final_data = pd.DataFrame(columns=columns)
        for i in range(0, len(data)):
            z = pd.DataFrame(data[i])
            zz = z.transpose()
            if len(zz.index) == 3:
                final_data = final_data.append(zz.loc[[zz.index[2]]])
            elif len(zz.index) == 4:
                final_data = final_data.append(zz.loc[[zz.index[2]]])
                final_data = final_data.append(zz.loc[[zz.index[3]]])
        final_data = final_data.reset_index()
        final_data = final_data.rename(columns={'index': 'option_type'})
        final_data = final_data[(final_data['openInterest'] > 0) & (final_data['changeinOpenInterest'] > 0)].reset_index(drop=True)
        path = Path('E:\code and finance\Data\FNO\Equity\options', k1, (dt.datetime.today()).strftime("%d-%m-%Y") + ".xlsx")
        final_data.to_excel(path)
I was using this code to get the data, but recently I started getting HTTPError: 401 Client Error when accessing the website from Python, even though it still works in the Chrome browser. What should I do to access the website from Python like I used to?
The link works in Chrome but not in Python.
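Not a guaranteed fix, but a 401 here usually means the NSE API is rejecting requests that don't carry the cookies set by the main site (the browser picks them up automatically). The same warm-up pattern used earlier in this thread, visiting the home page with a requests.Session first and reusing its cookies for the API call, may help; a minimal sketch using the ACC symbol from the list above:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
           'Accept-Language': 'en-US,en;q=0.9',
           'Accept-Encoding': 'gzip, deflate'}

session = requests.Session()
# Hit the home page first so the session picks up the cookies the API checks for.
session.get("https://www.nseindia.com/", headers=headers, timeout=30)
response = session.get("https://www.nseindia.com/api/option-chain-equities?symbol=ACC",
                       headers=headers, timeout=30)
response.raise_for_status()
data1 = response.json()
print(len(data1["records"]["data"]))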

Collect the Dropdown List from Link using Request

I have a link as below:
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=BANKNIFTY&date=9JAN2020"
I want to collect all the expiry dates available in the Expiry Date dropdown on that page.
My Code:
########################
import pandas as pd
from requests import Session
import os, time, sys
from datetime import datetime
s = Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/75.0.3770.80 Safari/537.36'}
# Add headers
s.headers.update(headers)
URL = 'https://www.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp'
params = {'symbolCode':9999,'symbol':'BANKNIFTY','instrument': '-','date': '9JAN2020','segmentLink': 17}
res = s.get(URL, params=params)
df1 = pd.read_html(res.content)[0]
df2 = pd.read_html(res.content)[1]
I am not able to get the values in df1 or df2.
It needs minimal knowledge of requests and BeautifulSoup or lxml.
import requests
import lxml.html
url = 'https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=BANKNIFTY&date=9JAN2020'
r = requests.get(url)
soup = lxml.html.fromstring(r.text)
items = soup.xpath('//form[@id="ocForm"]//option/text()')
print(items)
Result
[' Select ', '9JAN2020', '16JAN2020', '23JAN2020', '30JAN2020', '6FEB2020', '13FEB2020', '20FEB2020', '27FEB2020', '5MAR2020', '26MAR2020']
import pandas as pd
from requests import Session
import lxml.html
s = Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/75.0.3770.80 Safari/537.36'}
# Add headers
s.headers.update(headers)
URL = 'https://www.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp'
params = {'symbolCode':9999,'symbol':'BANKNIFTY','instrument': 'OPTIDX','date': '-','segmentLink': 17}
res = s.get(URL, params=params)
soup = lxml.html.fromstring(res.text)
items = soup.xpath('//form[@id="ocForm"]//option/text()')
print(items)
text = pd.read_html(res.content)[0].loc[0, 1]
print(text)
