Why does BeautifulSoup fail to extract data from websites to csv? - python
User Chrisvdberge helped me create the following code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url_DAX = 'https://www.eurexchange.com/exchange-en/market-data/statistics/market-statistics-online/100!onlineStats?viewType=4&productGroupId=13394&productId=34642&cp=&month=&year=&busDate=20191114'
req = requests.get(url_DAX, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_DAX.csv')
print(df)
url_DOW = 'https://www.cmegroup.com/trading/equity-index/us-index/e-mini-dow_quotes_settlements_futures.html'
req = requests.get(url_DOW, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_DOW.csv')
print(df)
url_NASDAQ = 'https://www.cmegroup.com/trading/equity-index/us-index/e-mini-nasdaq-100_quotes_settlements_futures.html'
req = requests.get(url_NASDAQ, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_NASDAQ.csv')
print(df)
url_CAC = 'https://live.euronext.com/fr/product/index-futures/FCE-DPAR/settlement-prices'
req = requests.get(url_CAC, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_CAC.csv')
print(df)
I get the following result:
Three .csv files are created: results_DAX.csv (everything is OK here, I get the values I want), plus results_DOW.csv and results_NASDAQ.csv (the problem is that these .csv files don't contain the wanted values, and I don't understand why).
As you can see in the code, 4 files should be created, not only 3.
So my questions are:
How do I get 4 csv files?
How do I get values into the results_DOW.csv and results_NASDAQ.csv files? (and maybe also into the results_CAC.csv file)
Thank you for your answers! :)
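For reference, a quick diagnostic (a sketch reusing the URLs defined above) shows whether each page even returns a parsable table to requests; if pd.read_html finds no tables in a response it raises a ValueError, which would also stop the script before the fourth file is written:
import requests
import pandas as pd

for name, url in [('DOW', url_DOW), ('NASDAQ', url_NASDAQ), ('CAC', url_CAC)]:
    html = requests.get(url, verify=False).text
    try:
        tables = pd.read_html(html)
        print(name, '- tables found:', len(tables))
        print(tables[0].head())  # likely a placeholder shell, not the settlement values
    except ValueError as err:
        print(name, '- read_html failed:', err)  # e.g. "No tables found"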
Try this to get those other sites. The last site is a little trickier, so you'd need to try out Selenium:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
url_DAX = 'https://www.eurexchange.com/exchange-en/market-data/statistics/market-statistics-online/100!onlineStats?viewType=4&productGroupId=13394&productId=34642&cp=&month=&year=&busDate=20191114'
df = pd.read_html(url_DAX)[0]
df.to_csv('results_DAX.csv')
print(df)
dt = date.today() - timedelta(days=2)
dateParam = dt.strftime('%m/%d/%Y')
url_DOW = 'https://www.cmegroup.com/CmeWS/mvc/Settlements/Futures/Settlements/318/FUT'
payload = {
    'tradeDate': dateParam,
    'strategy': 'DEFAULT',
    'pageSize': '500',
    '_': '1573920502874'}
response = requests.get(url_DOW, params=payload).json()
df = pd.DataFrame(response['settlements'])
df.to_csv('results_DOW.csv')
print(df)
url_NASDAQ = 'https://www.cmegroup.com/CmeWS/mvc/Settlements/Futures/Settlements/146/FUT'
payload = {
    'tradeDate': dateParam,
    'strategy': 'DEFAULT',
    'pageSize': '500',
    '_': '1573920650587'}
response = requests.get(url_NASDAQ, params=payload).json()
df = pd.DataFrame(response['settlements'])
df.to_csv('results_NASDAQ.csv')
print(df)
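For the CAC/Euronext page (the trickier one mentioned above), the table is built by JavaScript, so plain requests won't see it. A minimal Selenium sketch, assuming Chrome with a matching chromedriver is installed and that the rendered page exposes a table pandas can parse, might look like this:
from selenium import webdriver
import pandas as pd
import time

url_CAC = 'https://live.euronext.com/fr/product/index-futures/FCE-DPAR/settlement-prices'
driver = webdriver.Chrome()
driver.get(url_CAC)
time.sleep(5)  # crude wait for the JavaScript table to render
tables = pd.read_html(driver.page_source)  # parse whatever tables the rendered page contains
driver.quit()
df = tables[0]  # the index may need adjusting once you inspect the rendered page
df.to_csv('results_CAC.csv')
print(df)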
Related
Webscraping - html issue
I've been trying this code and getting some success, but cannot figure out the next step.
import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pprint
import json

url2 = "https://www.procyclingstats.com/rankings.php"
print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class':'basic'})
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)
#why do I only get two headers here (prev, team)?
response = requests.get(url2)
dfs = pd.read_html(response.text)[0]
#print(list(dfs))
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(dfs)
print(colored('#Step1','red'))

print(colored('#Step2','green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
table2 = soup.find({'class':'class="mt10 pps"'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)
# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]
response = requests.get(url3)
dfs2 = pd.read_html(response.text)[0]
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(dfs2)
child_soup = soup.find('h3')
for i in child_soup.children:
    print("child : ", i)
print('\n'*3)
I end up with the child as Rider (result text below):
#Step2
child :  Rider
What I'm trying to capture is the 'points per speciality' and the values. There is a second question as to why I only get two tags when they all appear to have the same name? (A photo with an arrow showing the desired result was attached.)
import re
import requests
from bs4 import BeautifulSoup

url2 = "https://www.procyclingstats.com/rankings.php"
response = requests.get(url2)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find("table", {"class": "basic"})
thead = table.find("thead")
headers = [heading.text for heading in thead.find_all("th")]

url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, "lxml")
ul = soup.find("ul", {"class": "basic"})
li = ul.find_all("li")
d = {}
for l in li:
    m = re.search(r"(\d+)(.*)", l.text)
    d[m.group(2)] = m.group(1)

print(headers)
print(d)
# ['#', 'Prev.', 'Diff.', 'Rider', 'Team', 'Points']
# {'One day races': '1641', 'GC': '3444', 'Time trial': '1147', 'Sprint': '302', 'Climber': '3816'}
Turning find_all() results into text so it can be usable in a pandas dataframe
I need to scrape content that is inside a div class, inside another div class which repeats, so I needed to use find_all to get them. I want to get them as text, so that when I put them in a dataframe it shows the name of the object inside (as you would get from find(...).text) instead of the entire HTML line.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.grammy.com/grammys/awards/winners-nominees/138'
page = requests.get(url).text
soup = BeautifulSoup(page, 'lxml')
category = soup.find_all('div', class_="view-grouping-content")
print(len(category))
for c in category:
    artistName = c.find_all('div', class_="views-field views-field-field-description")
import requests
from bs4 import BeautifulSoup

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    goal = [x.text for x in soup.select(
        '.freelink.freelink-nid.freelink-internal')]
    print(goal)

main('https://www.grammy.com/grammys/awards/winners-nominees/138')
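Since the goal is ultimately a dataframe, a minimal follow-up (a sketch, assuming the same CSS selector still matches the page) is to wrap the extracted text in pandas; the 'name' column label here is arbitrary:
import requests
import pandas as pd
from bs4 import BeautifulSoup

r = requests.get('https://www.grammy.com/grammys/awards/winners-nominees/138')
soup = BeautifulSoup(r.text, 'lxml')
names = [x.text for x in soup.select('.freelink.freelink-nid.freelink-internal')]
df = pd.DataFrame({'name': names})  # one column holding the extracted text values
print(df.head())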
Another angle of attack (if this could be from another site)...
import pandas as pd
import requests

url = r'https://en.wikipedia.org/wiki/Grammy_Award_for_Record_of_the_Year'
page = requests.get(url)
tables = pd.read_html(page.text)
df = pd.concat(tables[1:9])
df.dropna(thresh=3, inplace=True)
df = df.rename(columns={'Year[I]':'Year'})
df['Year'] = df['Year'].str.replace('\[\d+\]', '', regex=True)
df['Record'] = df['Record'].str.replace('"', '', regex=False)
print(df)
Outputs:
Data Scrape Output into Dataframe
Hello everyone, I have scraped this information from a job-listing site so far. Everything seems to work well, however I am struggling to get this information into a data frame with headers and everything. Any help is appreciated. My full code is:
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
python_jobs = results.find_all('h2', string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

job_elems = results.find_all('section', class_='card-content')
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())
    print()
Not sure how to approach this.
Use concat() for all columns and then append() to one dataframe in the loop:
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
python_jobs = results.find_all('h2', string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

job_elems = results.find_all('section', class_='card-content')
df = pd.DataFrame()
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    df1 = pd.concat([pd.Series(title_elem.text.strip()),
                     pd.Series(company_elem.text.strip()),
                     pd.Series(location_elem.text.strip())], axis=1)
    df = df.append(df1)
print(df)
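Note that DataFrame.append has been removed in recent pandas releases (2.0+). An equivalent approach, continuing from the job_elems found above, is to collect plain dicts in a list and build the frame once at the end (the column names here are just illustrative):
rows = []
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    rows.append({'title': title_elem.text.strip(),
                 'company': company_elem.text.strip(),
                 'location': location_elem.text.strip()})
df = pd.DataFrame(rows)  # one row per job listing
print(df)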
You can save the job details (i.e., title, company, and location) in a dictionary, then dataframe the dictionary.
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
python_jobs = results.find_all('h2', string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

job_elems = results.find_all('section', class_='card-content')
i = 1
my_job_list = {}
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    op = f'opening {i}'
    my_job_list[op] = {'position': title_elem.text.strip(),
                       'company': company_elem.text.strip(),
                       'location': location_elem.text.strip()}
    i = i + 1
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())

df = pd.DataFrame(my_job_list)
print(df)
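One note on the layout: pd.DataFrame(my_job_list) puts each opening in its own column. If you would rather have one row per opening, from_dict with orient='index' transposes that:
# Continuing from the my_job_list dictionary built above.
df = pd.DataFrame.from_dict(my_job_list, orient='index')  # rows are openings, columns are position/company/location
print(df)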
Get value from web link
I have a url from which I want to extract the line containing data like "Underlying Stock: NCC 96.70 As on Jun 06, 2019 10:12:20 IST", and to extract the symbol ("NCC") and the underlying price ("96.70") into a list.
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
You can make a request to the site and then parse the result with Beautiful Soup. Try this:
from bs4 import BeautifulSoup
import requests

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text)

# hacky way of finding and parsing the stock data
soup.get_text().split("Underlying Stock")[1][2:10].split(" ")
This prints out:
['NCC', '96.9']
PS: If you get a warning about lxml, that is because it becomes the default parser once you have it installed. In that case change the parsing line to soup = BeautifulSoup(res.text, features="lxml"). You need to have lxml installed, e.g. with pip install lxml in your environment.
Another version, less hacky:
import requests
from bs4 import BeautifulSoup

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
page_html = requests.get(url).text
page_soup = BeautifulSoup(page_html, "html.parser")
page_soup.find("b").next.split(' ')
A succinct way is to select for the first right-aligned table cell (td[align=right]), which you can actually simplify to just the attribute, [align=right]:
from bs4 import BeautifulSoup as bs
import requests

r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
headline = soup.select_one('[align=right]').text.strip().replace('\xa0\n', ' ')
print(headline)
You can also take the first row of the first table:
from bs4 import BeautifulSoup as bs
import requests

r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
table = soup.select_one('table')
headline = table.select_one('tr:nth-of-type(1)').text.replace('\n', ' ').replace('\xa0', ' ').strip()
print(headline)
from bs4 import BeautifulSoup
import requests

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")

# hacky way of finding and parsing the stock data
mylist = soup.get_text().split("Underlying Stock")[1][2:10].split(" ")
print(mylist[:2])
=============================
import pandas as pd

dict1 = {'SYMBOL': ['ACC','ADANIENT','ADANIPORTS','ADANIPOWER','AJANTPHARM','ALBK','AMARAJABAT','AMBUJACEM','APOLLOHOSP','APOLLOTYRE','ARVIND','ASHOKLEY','ASIANPAINT','AUROPHARMA','AXISBANK','BAJAJ-AUTO','BAJAJFINSV','BAJFINANCE','BALKRISIND','BANKBARODA','BANKINDIA','BANKNIFTY','BATAINDIA','BEL','BEML','BERGEPAINT','BHARATFIN','BHARATFORG','BHARTIARTL','BHEL','BIOCON','BOSCHLTD','BPCL','BRITANNIA','BSOFT','CADILAHC','CANBK','CANFINHOME','CASTROLIND','CEATLTD','CENTURYTEX','CESC','CGPOWER','CHENNPETRO','CHOLAFIN','CIPLA','COALINDIA','COLPAL','CONCOR','CUMMINSIND','DABUR','DCBBANK','DHFL','DISHTV','DIVISLAB','DLF','DRREDDY','EICHERMOT','ENGINERSIN','EQUITAS','ESCORTS','EXIDEIND','FEDERALBNK','GAIL','GLENMARK','GMRINFRA','GODFRYPHLP','GODREJCP','GODREJIND','GRASIM','GSFC','HAVELLS','HCLTECH','HDFC','HDFCBANK','HEROMOTOCO','HEXAWARE','HINDALCO','HINDPETRO','HINDUNILVR','HINDZINC','IBULHSGFIN','ICICIBANK','ICICIPRULI','IDBI','IDEA','IDFC','IDFCFIRSTB','IFCI','IGL','INDIACEM','INDIANB','INDIGO','INDUSINDBK','INFIBEAM','INFRATEL','INFY','IOC','IRB','ITC','JETAIRWAYS','JINDALSTEL','JISLJALEQS','JSWSTEEL','JUBLFOOD','JUSTDIAL','KAJARIACER','KOTAKBANK','KSCL','KTKBANK','L&TFH','LICHSGFIN','LT','LUPIN','M&M','M&MFIN','MANAPPURAM','MARICO','MARUTI','MCDOWELL-N','MCX','MFSL','MGL','MINDTREE','MOTHERSUMI','MRF','MRPL','MUTHOOTFIN','NATIONALUM','NBCC','NCC','NESTLEIND','NHPC','NIFTY','NIFTYIT','NIITTECH','NMDC','NTPC','OFSS','OIL','ONGC','ORIENTBANK','PAGEIND','PCJEWELLER','PEL','PETRONET','PFC','PIDILITIND','PNB','POWERGRID','PVR','RAMCOCEM','RAYMOND','RBLBANK','RECLTD','RELCAPITAL','RELIANCE','RELINFRA','REPCOHOME','RPOWER','SAIL','SBIN','SHREECEM','SIEMENS','SOUTHBANK','SRF','SRTRANSFIN','STAR','SUNPHARMA','SUNTV','SUZLON','SYNDIBANK','TATACHEM','TATACOMM','TATAELXSI','TATAGLOBAL','TATAMOTORS','TATAMTRDVR','TATAPOWER','TATASTEEL','TCS','TECHM','TITAN','TORNTPHARM','TORNTPOWER','TV18BRDCST','TVSMOTOR','UBL','UJJIVAN','ULTRACEMCO','UNIONBANK','UPL','VEDL','VGUARD','VOLTAS','WIPRO','WOCKPHARMA','YESBANK','ZEEL'],
         'LOT_SIZE': [400,4000,2500,20000,500,13000,700,2500,500,3000,2000,4000,600,1000,1200,250,125,250,800,4000,6000,20,550,6000,700,2200,500,1200,1851,7500,900,30,1800,200,2250,1600,2000,1800,3400,400,600,550,12000,1800,500,1000,2200,700,1563,700,1250,4500,1500,8000,400,2600,250,25,4100,4000,1100,2000,7000,2667,1000,45000,700,600,1500,750,4700,1000,700,500,250,200,1500,3500,2100,300,3200,500,1375,1500,10000,19868,13200,12000,35000,2750,4500,2000,600,300,4000,2000,1200,3500,3200,2400,2200,2250,9000,1500,500,1400,1300,400,1500,4700,4500,1100,375,700,1000,1250,6000,2600,75,1250,700,1200,600,600,2850,10,7000,1500,8000,8000,8000,50,27000,75,50,750,6000,4800,150,3399,3750,7000,25,6500,302,3000,6200,500,7000,4000,400,800,800,1200,6000,1500,500,1300,1100,16000,12000,3000,50,550,33141,250,600,1100,1100,1000,76000,15000,750,1000,400,2250,2000,3800,9000,1061,250,1200,750,500,3000,13000,1000,700,1600,200,7000,600,2300,3000,1000,3200,900,1750,1300]}
df1 = pd.DataFrame(dict1)

dict2 = {'SYMBOL': ['INFY', 'TATAMOTORS', 'IDBI', 'BHEL', 'LT'],
         'LTP': ['55', '66', '77', '88', '99'],
         'PRICE': ['0.25', '0.36', '0.12', '0.28', '0.85']}
df2 = pd.DataFrame(dict2)

print(df1, '\n\n')
print(df2, '\n\n')

df2['LOT_SIZE'] = df2[['SYMBOL']].merge(df1, how='left').LOT_SIZE
print(df2)
Unable to scrape this site. How to scrape data from this site?
I am not able to scrape data from this site. I tried with other sites and it works fine with them...
from bs4 import BeautifulSoup
from urllib.request import urlopen

response = urlopen("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
html = response.read()
parsed_html = BeautifulSoup(html, "html.parser")
containers = parsed_html.find_all("div", {"class": "c2prKC"})
print(len(containers))
It looks like the page is rendered by JavaScript after loading. You can use Selenium to render the page and Beautiful Soup to get the elements:
from bs4 import BeautifulSoup
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
time.sleep(5)
html = driver.page_source
parsed_html = BeautifulSoup(html, "html.parser")
containers = parsed_html.find_all("div", {"class": "c2prKC"})
print(len(containers))
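A small refinement (a sketch, assuming the "c2prKC" class still marks the product cards): an explicit wait is usually more robust than a fixed time.sleep(5), because it returns as soon as the element appears and raises a timeout if it never does:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
# wait up to 15 seconds for at least one product card to be present in the DOM
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, "c2prKC")))
parsed_html = BeautifulSoup(driver.page_source, "html.parser")
containers = parsed_html.find_all("div", {"class": "c2prKC"})
print(len(containers))
driver.quit()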
The info you want is in a script tag. You can use regex or loop over the script tags to get the right string to parse as JSON (with a small amendment):
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
res = requests.get('https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1', headers=headers)
soup = bs(res.content, 'lxml')

for script in soup.select('script'):
    if 'window.pageData=' in script.text:
        script = script.text.replace('window.pageData=', '')
        break

items = json.loads(script)['mods']['listItems']
results = []
for item in items:
    #print(item)
    #extract other info you want
    row = [item['name'], item['priceShow'], item['productUrl'], item['ratingScore']]
    results.append(row)

df = pd.DataFrame(results, columns=['Name', 'Price', 'ProductUrl', 'Rating'])
print(df.head())
Regex version:
import re
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
res = requests.get('https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1', headers=headers)
soup = bs(res.content, 'lxml')

r = re.compile(r'window.pageData=(.*)')
data = soup.find('script', text=r).text
script = r.findall(data)[0]

items = json.loads(script)['mods']['listItems']
results = []
for item in items:
    row = [item['name'], item['priceShow'], item['productUrl'], item['ratingScore']]
    results.append(row)

df = pd.DataFrame(results, columns=['Name', 'Price', 'ProductUrl', 'Rating'])
print(df.head())
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
res = requests.get('https://www.daraz.com.np/catalog/?q=camera&_keyori=ss&from=input&spm=a2a0e.searchlist.search.go.71a64360Kgxf1m', headers=headers)
soup = bs(res.content, 'lxml')

containerSearch = soup.find_all('script')  # the original snippet omitted this line; the loop below needs the script tags
scriptData = ''
for d in containerSearch:
    if 'window.pageData=' in str(d):
        scriptData = str(d).replace('window.pageData=', '')
        break

scriptData = scriptData.replace('<script>', '')
scriptData = scriptData.replace('</script>', '')

items = json.loads(scriptData)
name = items['mods']['listItems'][0]['name']
image = items['mods']['listItems'][0]['image']
price = items['mods']['listItems'][0]['price']
priceShow = items['mods']['listItems'][0]['priceShow']
ratingScore = items['mods']['listItems'][0]['ratingScore']
productUrl = items['mods']['listItems'][0]['productUrl']
print(name)
print(price)