I am trying to make my columns auto-adjust to the width of the data, but I get an error. Is there a way to make my data fit in the Excel file? The error is: ValueError: Shape of passed values is (1, 12), indices imply (1, 1). Is there any way to solve this error? Kindly check whether there is a workable solution.
import enum
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
r = requests.get("https://www.fleetpride.com/parts/otr-coiled-air-hose-otr6818")
soup = BeautifulSoup(r.content, "html5lib")
raw_json = ""
for table_index, table in enumerate(soup.find_all("script")):
    if 'CCRZ.detailData.jsonProductData = {"' in str(table):
        x = str(table).split('CCRZ.detailData.jsonProductData = {"')
        raw_json = "{\"" + str(x[-1]).split('};')[0] + "}"
        break
req_json = json.loads(raw_json)
# with open("text_json.json","w")as file:
# x=json.dump(req_json,file,indent=4)
temp = req_json
cat=temp['product']['prodBean']['friendlyUrl'][:11]
catu=temp['product']['prodBean']['friendlyUrl'][11:55]
catk=temp['product']['prodBean']['friendlyUrl'][56:71]
cup=temp['product']['prodBean']['friendlyUrl'][72:]
title=temp['product']['prodBean']['name']
specification=temp['product']['prodBean']['sku']
spec1=temp['product']['prodBean']['productSpecsS'][15]['specValue']
spec2=temp['product']['prodBean']['productSpecsS'][30]['specValue']
spec3=temp['product']['prodBean']['productSpecsS'][28]['specValue']
spec4=temp['product']['prodBean']['productSpecsS'][29]['specValue']
spec5=temp['product']['prodBean']['productSpecsS'][27]['specValue']
spec6=temp['product']['prodBean']['productSpecsS'][18]['specValue']
spec7=temp['product']['prodBean']['productSpecsS'][19]['specValue']
spec8=temp['product']['prodBean']['productSpecsS'][20]['specValue']
fea=spec6+spec7+spec8
spec11=temp['product']['prodBean']['ECrossReferencesS'][0]['Interchange_Part_Number__c']
spec12=temp['product']['prodBean']['ECrossReferencesS'][1]['Interchange_Part_Number__c']
spec13=temp['product']['prodBean']['ECrossReferencesS'][2]['Interchange_Part_Number__c']
spec14=temp['product']['prodBean']['ECrossReferencesS'][3]['Interchange_Part_Number__c']
spec15=temp['product']['prodBean']['ECrossReferencesS'][4]['Interchange_Part_Number__c']
spec16=temp['product']['prodBean']['ECrossReferencesS'][5]['Interchange_Part_Number__c']
cross=spec11+spec12+spec13+spec14+spec15+spec16
wev=[]
web={
'category':cat,
'sub_category':catu,
'sub_category1':catk,
'sub_category2':cup,
'name':title,
'Model_No':specification,
'VMRS':spec1,
'width_each':spec2,
'Quantity':spec3,
'Height_each':spec4,
'cross_reference':cross,
'feature':fea
}
# print(web)
wev.append(web)
df = pd.DataFrame(np.random.randint(0,100,size=(1, 12)),columns=wev)
# print(df)
df.to_csv('second.csv', index=False, encoding='utf-8')
Well, the following code should work:
writer = pd.ExcelWriter('test_file.xlsx')
df.to_excel(writer, sheet_name='test_sheet', index=False, na_rep='NaN')
for column in df:
    column_width = max(df[column].astype(str).map(len).max(), len(column))
    col_idx = df.columns.get_loc(column)
    writer.sheets['test_sheet'].set_column(col_idx, col_idx, column_width)
writer.save()
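An additional note (not part of the answer above): the original ValueError comes from the line df = pd.DataFrame(np.random.randint(0,100,size=(1, 12)),columns=wev). Because wev is a list holding a single dict, pandas ends up with a length-1 column index while the data has 12 columns, hence "Shape of passed values is (1, 12), indices imply (1, 1)". Since wev already holds the scraped record, the frame can be built directly from it. A minimal sketch, assuming the xlsxwriter package is installed (set_column is an XlsxWriter worksheet method) and reusing the sheet name from the answer:

df = pd.DataFrame(wev)  # one row, one column per key of the web dict
with pd.ExcelWriter('second.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='test_sheet', index=False)
    for column in df:
        column_width = max(df[column].astype(str).map(len).max(), len(column))
        col_idx = df.columns.get_loc(column)
        writer.sheets['test_sheet'].set_column(col_idx, col_idx, column_width)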
from datetime import timedelta, date
from nsepy import get_history
import pandas as pd
def importdata(stock):
    stock_fut = get_history(symbol=stock,
                            start=date.today() - timedelta(days=3), end=date.today(),
                            futures=True,
                            expiry_date=date(2022, 9, 29))
    # print(stock_fut.columns)
    print(stock_fut[["Symbol", "Number of Contracts", "Change in OI", "Open Interest"]])
a = ["ULTRACEMCO"]
#a = ["CONCOR", "JKCEMENT","SHREECEM","RAMCOCEM","INDIGO","ACC","BAJAJ-AUTO","ULTRACEMCO","PERSISTENT","MARUTI"]
for i in range(0, len(a)):
    # print(a[i])
    # importdata(a[i])
    df = pd.DataFrame(a[i])
    print(df)
This fails with an error saying the DataFrame constructor is not called properly. I also want to join the data for all the symbols into a single table.
import requests
import json
import codecs
import pandas as pd
baseurl = "https://www.nseindia.com/"
url = f'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, ''like Gecko) ''Chrome/80.0.3987.149 Safari/537.36','accept-language': 'en,gu;q=0.9,hi;q=0.8', 'accept-encoding': 'gzip, deflate, br'}
session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
df = pd.DataFrame(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
mini_df = df[['symbol']]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#print(df)
#print(mini_df)
print(mini_df.to_string(index=False))
Sir, what if I want to use this symbol output as the input value of "a" in the earlier code?
I tried to fix your code with minimal modifications. Hope it helps:
from datetime import timedelta, date
from nsepy import get_history
import pandas as pd
import requests
import json
import codecs
baseurl = "https://www.nseindia.com/"
url = f'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, ''like Gecko) ''Chrome/80.0.3987.149 Safari/537.36','accept-language': 'en,gu;q=0.9,hi;q=0.8', 'accept-encoding': 'gzip, deflate, br'}
session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
# here we use pandas.DataFrame.from_dict to build a dataframe from the collection of dictionaries inside the response dictionary
df1 = pd.DataFrame.from_dict(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
# and here we get a list of unique symbols
a = df1.symbol.unique().tolist()
def importdata(stock):
    stock_fut = get_history(symbol=stock,
                            start=date.today() - timedelta(days=3), end=date.today(),
                            futures=True,
                            expiry_date=date(2022, 9, 29))
    return stock_fut[["Symbol", "Number of Contracts", "Change in OI", "Open Interest"]]
# here we add all the dataframes to a list
df_list = []
for i in a:
    temp = importdata(i)
    temp_df = pd.DataFrame(temp)
    df_list.append(temp_df)
# and here we concatenate all of them together in a row-wise manner.
df = pd.concat(df_list)
print (df)
Change the line to:
df = pd.DataFrame([a[i]])
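For context (an aside, not from the original answer): pd.DataFrame rejects a bare string because it is a scalar, which is what produces the "DataFrame constructor not properly called" error the question mentions; wrapping it in a list gives a one-row frame. If the goal is one table for all symbols, the whole list can also be passed in a single call (the column name here is just illustrative):

df = pd.DataFrame(a, columns=['Symbol'])
print(df)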
Looking for help; I have exhausted my limited ability. The code below gives me everything I need, except that there are two embedded fields, 'sectionaltimes' and 'splittimes', which I also need included in the dataframe, and therefore exported to Excel, as individual components rather than as one long string. (A possible way to flatten them is sketched after the code.)
import requests
import json
import pandas as pd
import xlsxwriter
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0',
'Accept-Language' : 'en-US,en;q=0.5'}
df=pd.DataFrame()
for race in range(1, 9):
    url = f"https://s3-ap-southeast-2.amazonaws.com/racevic.static/2015-01-01/flemington/sectionaltimes/race-{race}.json?callback=sectionaltimes_callback"
    r = requests.get(url, headers=headers)
    json_obj = json.loads(r.text.split('sectionaltimes_callback(')[1].rsplit(')', 1)[0])
    main_df = pd.DataFrame(json_obj['Horses'])
    df = pd.concat([df, main_df])
df.reset_index(drop=True, inplace=True)
df.to_excel("20150101.xlsx")
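One possible way to flatten those embedded fields (a sketch, not from the thread; it assumes 'sectionaltimes' and 'splittimes' arrive as nested dict-like values inside each horse record, so the exact key names and nesting may need adjusting) is to replace the DataFrame construction inside the loop with pandas' json_normalize:

main_df = pd.json_normalize(json_obj['Horses'], sep='_')  # dict-valued fields become their own columns
# If a field turns out to hold a list of dicts rather than a dict, it stays as one
# column here; json_normalize's record_path/meta arguments cover that case.
df = pd.concat([df, main_df])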
I want the data in a DataFrame. The code works perfectly; please help me solve this issue and provide the data in a DataFrame. I tried to solve it myself but failed.
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
browser = webdriver.Chrome('F:\chromedriver.exe')
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"}
for title in browser.find_elements_by_css_selector('option'):
    title.click()
    time.sleep(1)
    browser.switch_to.frame(browser.find_elements_by_css_selector("iframe")[1])
    table = browser.find_element_by_css_selector("table table")
    soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
    all_data = []
    ratings = {"BUY": [], "HOLD": [], "SELL": []}
    lists_ = []
    for row in soup.select("tr")[-4:-1]:
        info_list = row.select("td")
        count = info_list[1].text
        percent = info_list[2].text
        IBServ_count = info_list[4].text
        IBServ_percent = info_list[5].text
        lists_.append([count, percent, IBServ_count, IBServ_percent])
    ratings["BUY"] = lists_[0]
    ratings["HOLD"] = lists_[1]
    ratings["SELL"] = lists_[2]
You can find the solution below:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
browser = webdriver.Chrome('F:\chromedriver.exe')
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"}
title_lists = []
buy_lists = []
hold_lists = []
sell_lists = []
for title in browser.find_elements_by_css_selector('option'):
    title.click()
    time.sleep(1)
    title_lists.append(title.text)
    browser.switch_to.frame(browser.find_elements_by_css_selector("iframe")[1])
    table = browser.find_element_by_css_selector("table table")
    soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
    lists_ = []
    for row in soup.select("tr")[-4:-1]:
        info_list = row.select("td")
        count = info_list[1].text
        percent = info_list[2].text
        IBServ_count = info_list[4].text
        IBServ_percent = info_list[5].text
        lists_.append([count, percent, IBServ_count, IBServ_percent])
    buy_lists.append(lists_[0])
    hold_lists.append(lists_[1])
    sell_lists.append(lists_[2])
    browser.switch_to.default_content()
header = pd.MultiIndex.from_product([['BUY','HOLD', 'SELL'],
['Count','Percent','IBServ_count', 'IBServ_percent']],names=['Action','Rating'])
m = np.array([[i[0] for i in buy_lists], [i[1] for i in buy_lists], [i[2] for i in buy_lists], [i[3] for i in buy_lists],
[i[0] for i in hold_lists], [i[1] for i in hold_lists], [i[2] for i in hold_lists], [i[3] for i in hold_lists],
[i[0] for i in sell_lists], [i[1] for i in sell_lists], [i[2] for i in sell_lists], [i[3] for i in sell_lists]])
dc = pd.DataFrame(np.rot90(m),columns = header)
dc["Title"] = title_lists
dc = dc.set_index("Title")
dc.to_csv('out.csv', index = 0)
I used numpy to manipulate the array of information I created. It may not be the best usage of pandas but it creates the DataFrame you are looking for.
You can do this:
data = {
'Details': lists_
}
df = pd.DataFrame.from_dict(data)
df.to_csv('out.csv', index = 0)
You should write this outside the for loop that you have.
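In other words (a sketch of the placement, using the question's own variables): once the scraping loops have finished filling lists_, build and save the frame a single time.

data = {'Details': lists_}
df = pd.DataFrame.from_dict(data)  # one row per [count, percent, IBServ_count, IBServ_percent] list
df.to_csv('out.csv', index=False)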
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
def scrap_hrefs(url, baseUrl):
    resp = requests.get(url, headers=header)
    respData = BeautifulSoup(resp.content, 'html.parser')
    allHrefs = respData.select('[href]')
    return allHrefs, baseUrl

def get_hrefs(allHrefs, baseUrl):
    for i in range(0, len(allHrefs)):
        if allHrefs[i]['href'].startswith('/'):
            allHrefs[i] = baseUrl + allHrefs[i]['href']
        else:
            allHrefs[i] = allHrefs[i]['href']
    return allHrefs

def clean_hrefs(allHrefs):
    links = {'links': allHrefs}
    df = pd.DataFrame(links).drop_duplicates()
    df = df[df['links'].str.contains('financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings|news|media')]
    for i in range(0, len(df)):
        if df[i]['links'].str.find('financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings') != -1:
            df[i]['segments'] = df['Finance']
        else:
            continue
    return df

def store_hrefs(df):
    df.to_csv("testing.csv", index=False)

def run_scraper(url, baseUrl):
    store_hrefs(clean_hrefs(get_hrefs(*scrap_hrefs(url, baseUrl))))

run_scraper('https://www.example.com/', 'https://www.example.com')
In the clean_hrefs() function, I want to get the first link from the data frame and check whether its content has one of the words 'finance, investors, ir, report, filings'. If it does, I want to create another column called 'segments' and assign it the id 'FINANCE'.
But it's giving an error: KeyError: 0.
Any help would be much appreciated!
You can set the column with a mask, similar to filtering; rows with no match get missing values:
mask = df['links'].str.contains('financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings')
df.loc[mask, 'segments'] = 'Finance'
which works like:
df['segments'] = np.where(mask, 'Finance', np.nan)
EDIT:
If you want to set multiple values, you can specify the new values in a dictionary and then set the segments column like this:
d = {'INVESTOR':'financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings',
'NEWS':'news|media'}
for k, v in d.items():
df.loc[df['links'].str.contains(v, na=False), 'segmentID'] = k
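For illustration, here is a minimal self-contained run of that loop (the links and patterns are made up):

import pandas as pd

df = pd.DataFrame({'links': ['https://example.com/investors',
                             'https://example.com/news',
                             'https://example.com/contact']})
d = {'INVESTOR': 'investors|investor|relations|report|filings',
     'NEWS': 'news|media'}
for k, v in d.items():
    df.loc[df['links'].str.contains(v, na=False), 'segmentID'] = k
# segmentID is now INVESTOR, NEWS and NaN for the three rows respectively
print(df)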
I am trying to automate a search which returns a table of information. I am able to print the results as .text, but my question is how I can pass the results into a pandas DataFrame. The reason I am asking is twofold: I want to write the results to a CSV file, and I need the results in pandas to do data analysis later on. I would appreciate it if anyone could help. My code is below:
import time
from selenium import webdriver
import pandas as pd
search = ['0501020210597400','0501020210597500','0501020210597600']
df = pd.DataFrame(search)
chrome_path = [Chrome Path]
driver = webdriver.Chrome(chrome_path)
driver.get('https://enquiry.mpsj.gov.my/v2/service/cuk_search/')
x = 0
while x < (len(df.index)):
    search_box = driver.find_element_by_name('sel_value')
    new_line = (df[0][x]).format(x)
    search_box.send_keys(new_line)
    search_box.submit()
    time.sleep(5)
    table = driver.find_elements_by_class_name('tr-body')
    for data in table:
        print(data.text)
    driver.find_element_by_name('sel_value').clear()
    x += 1
driver.close()
To load a CSV file to a DataFrame, you can do:
df = pd.read_csv('example.csv')
See the online doc: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html#pandas.read_csv
To write the data to CSV, consult this article: Pandas writing dataframe to CSV file on SO.
The solution is:
df.to_csv(file_name, sep='\t')
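To tie this to the loop in the question (a hedged sketch; the variable and column names are just illustrative): append each row's text to a list while the while loop runs, for example rows.append(data.text) instead of print(data.text), and build the frame once scraping is finished.

import pandas as pd

rows = ['example row 1', 'example row 2']  # stand-in for the texts collected by rows.append(data.text)
results = pd.DataFrame(rows, columns=['row_text'])
results.to_csv('results.csv', index=False)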
You can use requests and do a POST to get the info rather than using selenium:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
search = ['0501020210597400','0501020210597500','0501020210597600']
headers = {'Referer' : 'https://enquiry.mpsj.gov.my/v2/service/cuk_search/1',
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
output = []
dfHeaders = ['No.', 'No. Akaun', 'Nama Di Bil', 'Jumlah Perlu Dibayar', '']
with requests.Session() as s:
    for item in search:
        r = s.get('https://enquiry.mpsj.gov.my/v2/service/cuk_search/1', headers=headers)
        soup = bs(r.content, 'lxml')
        key = soup.select_one('[name=ACCESS_KEY]')['value']
        body = {'sel_input': 'no_akaun', 'sel_value': item, 'ACCESS_KEY': key}
        res = s.post('https://enquiry.mpsj.gov.my/v2/service/cuk_search_submit/', data=body)
        soup = bs(res.content, 'lxml')
        table = soup.select_one('.tbl-list')
        rows = table.select('.tr-body')
        for row in rows:
            cols = row.find_all('td')
            cols = [item.text.strip() for item in cols]
            output.append([item for item in cols if item])
df = pd.DataFrame(output, columns = dfHeaders)
print(df)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )