Why does BeautifulSoup fail to extract data from websites to csv? - python
User Chrisvdberge helped me create the following code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url_DAX = 'https://www.eurexchange.com/exchange-en/market-data/statistics/market-statistics-online/100!onlineStats?viewType=4&productGroupId=13394&productId=34642&cp=&month=&year=&busDate=20191114'
req = requests.get(url_DAX, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_DAX.csv')
print(df)
url_DOW = 'https://www.cmegroup.com/trading/equity-index/us-index/e-mini-dow_quotes_settlements_futures.html'
req = requests.get(url_DOW, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_DOW.csv')
print(df)
url_NASDAQ = 'https://www.cmegroup.com/trading/equity-index/us-index/e-mini-nasdaq-100_quotes_settlements_futures.html'
req = requests.get(url_NASDAQ, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_NASDAQ.csv')
print(df)
url_CAC = 'https://live.euronext.com/fr/product/index-futures/FCE-DPAR/settlement-prices'
req = requests.get(url_CAC, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_CAC.csv')
print(df)
I get the following result:
Three .csv files are created: results_DAX.csv (everything is OK here, I get the values I want), plus results_DOW.csv and results_NASDAQ.csv (the problem is that these .csv files don't contain the wanted values, and I don't understand why).
As you can see in the code, 4 files should be created, not only 3.
So my questions are:
How do I get 4 csv files?
How do I get values into the results_DOW.csv and results_NASDAQ.csv files? (and maybe also into the results_CAC.csv file)
Thank you for your answers! :)
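For reference, a quick diagnostic (a sketch reusing the URLs defined above) shows whether each page even returns a parsable table to requests; if pd.read_html finds no tables in a response it raises a ValueError, which would also stop the script before the fourth file is written:
import requests
import pandas as pd

for name, url in [('DOW', url_DOW), ('NASDAQ', url_NASDAQ), ('CAC', url_CAC)]:
    html = requests.get(url, verify=False).text
    try:
        tables = pd.read_html(html)
        print(name, '- tables found:', len(tables))
        print(tables[0].head())  # likely a placeholder shell, not the settlement values
    except ValueError as err:
        print(name, '- read_html failed:', err)  # e.g. "No tables found"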
Try this to get those other sites. The last site is a little trickier, so you'd need to try out Selenium:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
url_DAX = 'https://www.eurexchange.com/exchange-en/market-data/statistics/market-statistics-online/100!onlineStats?viewType=4&productGroupId=13394&productId=34642&cp=&month=&year=&busDate=20191114'
df = pd.read_html(url_DAX)[0]
df.to_csv('results_DAX.csv')
print(df)
dt = date.today() - timedelta(days=2)
dateParam = dt.strftime('%m/%d/%Y')
url_DOW = 'https://www.cmegroup.com/CmeWS/mvc/Settlements/Futures/Settlements/318/FUT'
payload = {
    'tradeDate': dateParam,
    'strategy': 'DEFAULT',
    'pageSize': '500',
    '_': '1573920502874'}
response = requests.get(url_DOW, params=payload).json()
df = pd.DataFrame(response['settlements'])
df.to_csv('results_DOW.csv')
print(df)
url_NASDAQ = 'https://www.cmegroup.com/CmeWS/mvc/Settlements/Futures/Settlements/146/FUT'
payload = {
    'tradeDate': dateParam,
    'strategy': 'DEFAULT',
    'pageSize': '500',
    '_': '1573920650587'}
response = requests.get(url_NASDAQ, params=payload).json()
df = pd.DataFrame(response['settlements'])
df.to_csv('results_NASDAQ.csv')
print(df)
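For the CAC/Euronext page (the trickier one mentioned above), the table is built by JavaScript, so plain requests won't see it. A minimal Selenium sketch, assuming Chrome with a matching chromedriver is installed and that the rendered page exposes a table pandas can parse, might look like this:
from selenium import webdriver
import pandas as pd
import time

url_CAC = 'https://live.euronext.com/fr/product/index-futures/FCE-DPAR/settlement-prices'
driver = webdriver.Chrome()
driver.get(url_CAC)
time.sleep(5)  # crude wait for the JavaScript table to render
tables = pd.read_html(driver.page_source)  # parse whatever tables the rendered page contains
driver.quit()
df = tables[0]  # the index may need adjusting once you inspect the rendered page
df.to_csv('results_CAC.csv')
print(df)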
Related
Webscraping - html issue
I've been trying this code and getting some success, but cannot figure out the next step.
import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pprint
import json

url2 = "https://www.procyclingstats.com/rankings.php"
print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class':'basic'})
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)
#why do I only get two headers here (prev, team)?
response = requests.get(url2)
dfs = pd.read_html(response.text)[0]
#print(list(dfs))
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(dfs)
print(colored('#Step1','red'))

print(colored('#Step2','green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
table2 = soup.find({'class':'class="mt10 pps"'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)
# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]
response = requests.get(url3)
dfs2 = pd.read_html(response.text)[0]
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(dfs2)
child_soup = soup.find('h3')
for i in child_soup.children:
    print("child : ", i)
print('\n'*3)
I end up with the child as Rider (result text below):
#Step2
child :  Rider
What I'm trying to capture is the 'points per speciality' and the values. There is a second question as to why I only get two tags when they all appear to have the same name? (A photo with an arrow showing the desired result was attached.)
import re
import requests
from bs4 import BeautifulSoup

url2 = "https://www.procyclingstats.com/rankings.php"
response = requests.get(url2)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find("table", {"class": "basic"})
thead = table.find("thead")
headers = [heading.text for heading in thead.find_all("th")]

url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, "lxml")
ul = soup.find("ul", {"class": "basic"})
li = ul.find_all("li")
d = {}
for l in li:
    m = re.search(r"(\d+)(.*)", l.text)
    d[m.group(2)] = m.group(1)

print(headers)
print(d)
# ['#', 'Prev.', 'Diff.', 'Rider', 'Team', 'Points']
# {'One day races': '1641', 'GC': '3444', 'Time trial': '1147', 'Sprint': '302', 'Climber': '3816'}
Turning find_all() results into text so it can be usable in a pandas dataframe
I need to scrape content that is inside a div class, inside another div class which repeats, so I needed to use find_all to get them. I want to get them as text, so that when I put them in a dataframe it shows the name of the object inside (as you would get from find(...).text) instead of the entire HTML line.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.grammy.com/grammys/awards/winners-nominees/138'
page = requests.get(url).text
soup = BeautifulSoup(page, 'lxml')
category = soup.find_all('div', class_="view-grouping-content")
print(len(category))
for c in category:
    artistName = c.find_all('div', class_="views-field views-field-field-description")
import requests
from bs4 import BeautifulSoup

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    goal = [x.text for x in soup.select(
        '.freelink.freelink-nid.freelink-internal')]
    print(goal)

main('https://www.grammy.com/grammys/awards/winners-nominees/138')
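Since the goal is ultimately a dataframe, a minimal follow-up (a sketch, assuming the same CSS selector still matches the page) is to wrap the extracted text in pandas; the 'name' column label here is arbitrary:
import requests
import pandas as pd
from bs4 import BeautifulSoup

r = requests.get('https://www.grammy.com/grammys/awards/winners-nominees/138')
soup = BeautifulSoup(r.text, 'lxml')
names = [x.text for x in soup.select('.freelink.freelink-nid.freelink-internal')]
df = pd.DataFrame({'name': names})  # one column holding the extracted text values
print(df.head())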
Another angle of attack (if this could be from another site)...
import pandas as pd
import requests

url = r'https://en.wikipedia.org/wiki/Grammy_Award_for_Record_of_the_Year'
page = requests.get(url)
tables = pd.read_html(page.text)
df = pd.concat(tables[1:9])
df.dropna(thresh=3, inplace=True)
df = df.rename(columns={'Year[I]':'Year'})
df['Year'] = df['Year'].str.replace('\[\d+\]', '', regex=True)
df['Record'] = df['Record'].str.replace('"', '', regex=False)
print(df)
Outputs:
Data Scrape Output into Dataframe
Hello everyone, I have scraped this information from a job-listing site so far. Everything seems to work well, however I am struggling to get this information into a data frame with headers and everything. Any help is appreciated. My full code is:
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
python_jobs = results.find_all('h2', string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

job_elems = results.find_all('section', class_='card-content')
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())
    print()
Not sure how to approach this.
Use concat() for all columns and then append() to one dataframe in the loop:
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
python_jobs = results.find_all('h2', string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

job_elems = results.find_all('section', class_='card-content')
df = pd.DataFrame()
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    df1 = pd.concat([pd.Series(title_elem.text.strip()),
                     pd.Series(company_elem.text.strip()),
                     pd.Series(location_elem.text.strip())], axis=1)
    df = df.append(df1)
print(df)
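Note that DataFrame.append has been removed in recent pandas releases (2.0+). An equivalent approach, continuing from the job_elems found above, is to collect plain dicts in a list and build the frame once at the end (the column names here are just illustrative):
rows = []
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    rows.append({'title': title_elem.text.strip(),
                 'company': company_elem.text.strip(),
                 'location': location_elem.text.strip()})
df = pd.DataFrame(rows)  # one row per job listing
print(df)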
You can save the job details (i.e., title, company, and location) in a dictionary, then dataframe the dictionary.
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
python_jobs = results.find_all('h2', string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

job_elems = results.find_all('section', class_='card-content')
i = 1
my_job_list = {}
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    op = f'opening {i}'
    my_job_list[op] = {'position': title_elem.text.strip(),
                       'company': company_elem.text.strip(),
                       'location': location_elem.text.strip()}
    i = i + 1
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())

df = pd.DataFrame(my_job_list)
print(df)
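One note on the layout: pd.DataFrame(my_job_list) puts each opening in its own column. If you would rather have one row per opening, from_dict with orient='index' transposes that:
# Continuing from the my_job_list dictionary built above.
df = pd.DataFrame.from_dict(my_job_list, orient='index')  # rows are openings, columns are position/company/location
print(df)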
Get value from web link
I have a url from which I want to extract the line containing data like "Underlying Stock: NCC 96.70 As on Jun 06, 2019 10:12:20 IST", and to extract the symbol ("NCC") and the underlying price ("96.70") into a list.
url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
You can make a request to the site and then parse the result with Beautiful Soup. Try this:
from bs4 import BeautifulSoup
import requests

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text)

# hacky way of finding and parsing the stock data
soup.get_text().split("Underlying Stock")[1][2:10].split(" ")
This prints out:
['NCC', '96.9']
PS: If you get a warning about lxml, that is because it becomes the default parser once you have it installed. In that case change the parsing line to soup = BeautifulSoup(res.text, features="lxml"). You need to have lxml installed, e.g. with pip install lxml in your environment.
Another version, less hacky:
import requests
from bs4 import BeautifulSoup

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
page_html = requests.get(url).text
page_soup = BeautifulSoup(page_html, "html.parser")
page_soup.find("b").next.split(' ')
A succinct way is to select for the first right-aligned table cell (td[align=right]), which you can actually simplify to just the attribute, [align=right]:
from bs4 import BeautifulSoup as bs
import requests

r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
headline = soup.select_one('[align=right]').text.strip().replace('\xa0\n', ' ')
print(headline)
You can also take the first row of the first table:
from bs4 import BeautifulSoup as bs
import requests

r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17')
soup = bs(r.content, 'lxml')
table = soup.select_one('table')
headline = table.select_one('tr:nth-of-type(1)').text.replace('\n', ' ').replace('\xa0', ' ').strip()
print(headline)
from bs4 import BeautifulSoup
import requests

url = "https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=917&symbol=NCC&symbol=ncc&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")

# hacky way of finding and parsing the stock data
mylist = soup.get_text().split("Underlying Stock")[1][2:10].split(" ")
print(mylist[:2])
=============================
import pandas as pd

dict1 = {'SYMBOL': ['ACC','ADANIENT','ADANIPORTS','ADANIPOWER','AJANTPHARM','ALBK','AMARAJABAT','AMBUJACEM','APOLLOHOSP','APOLLOTYRE','ARVIND','ASHOKLEY','ASIANPAINT','AUROPHARMA','AXISBANK','BAJAJ-AUTO','BAJAJFINSV','BAJFINANCE','BALKRISIND','BANKBARODA','BANKINDIA','BANKNIFTY','BATAINDIA','BEL','BEML','BERGEPAINT','BHARATFIN','BHARATFORG','BHARTIARTL','BHEL','BIOCON','BOSCHLTD','BPCL','BRITANNIA','BSOFT','CADILAHC','CANBK','CANFINHOME','CASTROLIND','CEATLTD','CENTURYTEX','CESC','CGPOWER','CHENNPETRO','CHOLAFIN','CIPLA','COALINDIA','COLPAL','CONCOR','CUMMINSIND','DABUR','DCBBANK','DHFL','DISHTV','DIVISLAB','DLF','DRREDDY','EICHERMOT','ENGINERSIN','EQUITAS','ESCORTS','EXIDEIND','FEDERALBNK','GAIL','GLENMARK','GMRINFRA','GODFRYPHLP','GODREJCP','GODREJIND','GRASIM','GSFC','HAVELLS','HCLTECH','HDFC','HDFCBANK','HEROMOTOCO','HEXAWARE','HINDALCO','HINDPETRO','HINDUNILVR','HINDZINC','IBULHSGFIN','ICICIBANK','ICICIPRULI','IDBI','IDEA','IDFC','IDFCFIRSTB','IFCI','IGL','INDIACEM','INDIANB','INDIGO','INDUSINDBK','INFIBEAM','INFRATEL','INFY','IOC','IRB','ITC','JETAIRWAYS','JINDALSTEL','JISLJALEQS','JSWSTEEL','JUBLFOOD','JUSTDIAL','KAJARIACER','KOTAKBANK','KSCL','KTKBANK','L&TFH','LICHSGFIN','LT','LUPIN','M&M','M&MFIN','MANAPPURAM','MARICO','MARUTI','MCDOWELL-N','MCX','MFSL','MGL','MINDTREE','MOTHERSUMI','MRF','MRPL','MUTHOOTFIN','NATIONALUM','NBCC','NCC','NESTLEIND','NHPC','NIFTY','NIFTYIT','NIITTECH','NMDC','NTPC','OFSS','OIL','ONGC','ORIENTBANK','PAGEIND','PCJEWELLER','PEL','PETRONET','PFC','PIDILITIND','PNB','POWERGRID','PVR','RAMCOCEM','RAYMOND','RBLBANK','RECLTD','RELCAPITAL','RELIANCE','RELINFRA','REPCOHOME','RPOWER','SAIL','SBIN','SHREECEM','SIEMENS','SOUTHBANK','SRF','SRTRANSFIN','STAR','SUNPHARMA','SUNTV','SUZLON','SYNDIBANK','TATACHEM','TATACOMM','TATAELXSI','TATAGLOBAL','TATAMOTORS','TATAMTRDVR','TATAPOWER','TATASTEEL','TCS','TECHM','TITAN','TORNTPHARM','TORNTPOWER','TV18BRDCST','TVSMOTOR','UBL','UJJIVAN','ULTRACEMCO','UNIONBANK','UPL','VEDL','VGUARD','VOLTAS','WIPRO','WOCKPHARMA','YESBANK','ZEEL'],
         'LOT_SIZE': [400,4000,2500,20000,500,13000,700,2500,500,3000,2000,4000,600,1000,1200,250,125,250,800,4000,6000,20,550,6000,700,2200,500,1200,1851,7500,900,30,1800,200,2250,1600,2000,1800,3400,400,600,550,12000,1800,500,1000,2200,700,1563,700,1250,4500,1500,8000,400,2600,250,25,4100,4000,1100,2000,7000,2667,1000,45000,700,600,1500,750,4700,1000,700,500,250,200,1500,3500,2100,300,3200,500,1375,1500,10000,19868,13200,12000,35000,2750,4500,2000,600,300,4000,2000,1200,3500,3200,2400,2200,2250,9000,1500,500,1400,1300,400,1500,4700,4500,1100,375,700,1000,1250,6000,2600,75,1250,700,1200,600,600,2850,10,7000,1500,8000,8000,8000,50,27000,75,50,750,6000,4800,150,3399,3750,7000,25,6500,302,3000,6200,500,7000,4000,400,800,800,1200,6000,1500,500,1300,1100,16000,12000,3000,50,550,33141,250,600,1100,1100,1000,76000,15000,750,1000,400,2250,2000,3800,9000,1061,250,1200,750,500,3000,13000,1000,700,1600,200,7000,600,2300,3000,1000,3200,900,1750,1300]}
df1 = pd.DataFrame(dict1)

dict2 = {'SYMBOL': ['INFY', 'TATAMOTORS', 'IDBI', 'BHEL', 'LT'],
         'LTP': ['55', '66', '77', '88', '99'],
         'PRICE': ['0.25', '0.36', '0.12', '0.28', '0.85']}
df2 = pd.DataFrame(dict2)

print(df1, '\n\n')
print(df2, '\n\n')

df2['LOT_SIZE'] = df2[['SYMBOL']].merge(df1, how='left').LOT_SIZE
print(df2)
Unable to scrape this site. How to scrape data from this site?
I am not able to scrape data from this site. I tried with other sites and it works fine with them...
from bs4 import BeautifulSoup
from urllib.request import urlopen

response = urlopen("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
html = response.read()
parsed_html = BeautifulSoup(html, "html.parser")
containers = parsed_html.find_all("div", {"class": "c2prKC"})
print(len(containers))
It looks like the page is rendered by JavaScript after loading. You can use Selenium to render the page and Beautiful Soup to get the elements:
from bs4 import BeautifulSoup
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
time.sleep(5)
html = driver.page_source
parsed_html = BeautifulSoup(html, "html.parser")
containers = parsed_html.find_all("div", {"class": "c2prKC"})
print(len(containers))
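A small refinement (a sketch, assuming the "c2prKC" class still marks the product cards): an explicit wait is usually more robust than a fixed time.sleep(5), because it returns as soon as the element appears and raises a timeout if it never does:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
# wait up to 15 seconds for at least one product card to be present in the DOM
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, "c2prKC")))
parsed_html = BeautifulSoup(driver.page_source, "html.parser")
containers = parsed_html.find_all("div", {"class": "c2prKC"})
print(len(containers))
driver.quit()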
The info you want is in a script tag. You can use regex or loop over the script tags to get the right string to parse as JSON (with a small amendment):
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
res = requests.get('https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1', headers=headers)
soup = bs(res.content, 'lxml')

for script in soup.select('script'):
    if 'window.pageData=' in script.text:
        script = script.text.replace('window.pageData=', '')
        break

items = json.loads(script)['mods']['listItems']
results = []
for item in items:
    #print(item)
    #extract other info you want
    row = [item['name'], item['priceShow'], item['productUrl'], item['ratingScore']]
    results.append(row)

df = pd.DataFrame(results, columns=['Name', 'Price', 'ProductUrl', 'Rating'])
print(df.head())
Regex version:
import re
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
res = requests.get('https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1', headers=headers)
soup = bs(res.content, 'lxml')

r = re.compile(r'window.pageData=(.*)')
data = soup.find('script', text=r).text
script = r.findall(data)[0]

items = json.loads(script)['mods']['listItems']
results = []
for item in items:
    row = [item['name'], item['priceShow'], item['productUrl'], item['ratingScore']]
    results.append(row)

df = pd.DataFrame(results, columns=['Name', 'Price', 'ProductUrl', 'Rating'])
print(df.head())
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
res = requests.get('https://www.daraz.com.np/catalog/?q=camera&_keyori=ss&from=input&spm=a2a0e.searchlist.search.go.71a64360Kgxf1m', headers=headers)
soup = bs(res.content, 'lxml')

containerSearch = soup.find_all('script')  # the original snippet omitted this line; the loop below needs the script tags
scriptData = ''
for d in containerSearch:
    if 'window.pageData=' in str(d):
        scriptData = str(d).replace('window.pageData=', '')
        break

scriptData = scriptData.replace('<script>', '')
scriptData = scriptData.replace('</script>', '')

items = json.loads(scriptData)
name = items['mods']['listItems'][0]['name']
image = items['mods']['listItems'][0]['image']
price = items['mods']['listItems'][0]['price']
priceShow = items['mods']['listItems'][0]['priceShow']
ratingScore = items['mods']['listItems'][0]['ratingScore']
productUrl = items['mods']['listItems'][0]['productUrl']
print(name)
print(price)