I am not able to scrape data from this site.
I tried my code with other sites and it works fine there...
from bs4 import BeautifulSoup
from urllib.request import urlopen
response = urlopen("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
html = response.read()
parsed_html = BeautifulSoup(html, "html.parser")
containers = parsed_html.find_all("div", {"class" : "c2prKC"})
print(len(containers))
Looks like JavaScript renders the page after loading. You can use Selenium to render the page and BeautifulSoup to get the elements.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
time.sleep(5)
html = driver.page_source
parsed_html = BeautifulSoup(html, "html.parser")
containers = parsed_html.find_all("div", {"class" : "c2prKC"})
print(len(containers))
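If a fixed sleep proves flaky, an explicit wait is more reliable. A sketch (assuming c2prKC is still the class Daraz uses for product cards):

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
# Wait up to 15 seconds for the first product card to render
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, "c2prKC"))
)
parsed_html = BeautifulSoup(driver.page_source, "html.parser")
print(len(parsed_html.find_all("div", {"class": "c2prKC"})))
driver.quit()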
The info you want is in a script tag. You can use a regex, or loop over the script tags, to get the right string to parse as JSON (with a small amendment).
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0'
}

res = requests.get('https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1', headers=headers)
soup = bs(res.content, 'lxml')

# Find the script tag that carries the embedded page data
for script in soup.select('script'):
    if 'window.pageData=' in script.text:
        script = script.text.replace('window.pageData=', '')
        break

items = json.loads(script)['mods']['listItems']

results = []
for item in items:
    # extract other info you want
    row = [item['name'], item['priceShow'], item['productUrl'], item['ratingScore']]
    results.append(row)

df = pd.DataFrame(results, columns=['Name', 'Price', 'ProductUrl', 'Rating'])
print(df.head())
Regex version:
import requests
import re
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0'
}

res = requests.get('https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1', headers=headers)
soup = bs(res.content, 'lxml')

# Pull the JSON out of the matching script tag with a capture group
r = re.compile(r'window\.pageData=(.*)')
data = soup.find('script', text=r).text
script = r.findall(data)[0]
items = json.loads(script)['mods']['listItems']

results = []
for item in items:
    row = [item['name'], item['priceShow'], item['productUrl'], item['ratingScore']]
    results.append(row)

df = pd.DataFrame(results, columns=['Name', 'Price', 'ProductUrl', 'Rating'])
print(df.head())
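If Daraz changes the page layout, soup.find returns None and the .text access raises AttributeError. A small defensive variant (a sketch, same assumptions as above):

# Guard against the script tag disappearing after a site redesign
script_tag = soup.find('script', text=re.compile(r'window\.pageData'))
if script_tag is None:
    raise SystemExit('window.pageData not found - the page layout may have changed')
match = re.search(r'window\.pageData\s*=\s*(.*)', script_tag.text)
items = json.loads(match.group(1))['mods']['listItems']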
import requests
import json
from bs4 import BeautifulSoup as bs

headers = {
    'User-Agent': 'Mozilla/5.0'
}

res = requests.get('https://www.daraz.com.np/catalog/?q=camera&_keyori=ss&from=input&spm=a2a0e.searchlist.search.go.71a64360Kgxf1m', headers=headers)
soup = bs(res.content, 'lxml')

# Collect all script tags and keep the one holding window.pageData
containerSearch = soup.find_all('script')
scriptData = ''
for d in containerSearch:
    if 'window.pageData=' in str(d):
        scriptData = str(d).replace('window.pageData=', '')
        break

# Strip the surrounding tag markup so only the JSON remains
scriptData = scriptData.replace('<script>', '')
scriptData = scriptData.replace('</script>', '')

items = json.loads(scriptData)

name = items['mods']['listItems'][0]['name']
image = items['mods']['listItems'][0]['image']
price = items['mods']['listItems'][0]['price']
priceShow = items['mods']['listItems'][0]['priceShow']
ratingScore = items['mods']['listItems'][0]['ratingScore']
productUrl = items['mods']['listItems'][0]['productUrl']
print(name)
print(price)
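The parsed JSON holds every listing, not just the first, so a short loop over the same keys avoids repeating the [0] index:

# Iterate over all listings in the embedded page data
for item in items['mods']['listItems']:
    print(item['name'], item['priceShow'], item['productUrl'])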
I got stuck on extracting the names and links: it doesn't return anything for them, but it does print the prices.
The link I am scraping from is: https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
r = requests.get(url)
time.sleep(6)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_='col-md-12 pr-0 pl-0')

for property in content:
    links = property.find('div', {'class': 'col-md-12 d-table-cell align-middle'})['href']
    name = property.find('img', class_='img-fluid').text.strip()
    price = property.find('div', class_='ProductPriceRating d-table-cell text-center pl-1 pr-1 align-middle').text.strip()
    print(name, links, price)
You can try it like this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv

url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_='col-md-12 pr-0 pl-0')

header = ['url', 'item', 'price']
data = []
for property in content:
    link = [i['href'] for i in property.findAll("a")][0]
    title = [i.getText(strip=True) for i in property.find_all("a")][1]
    price = [i.getText(strip=True) for i in property.find_all('div', {'class': "ProductPriceRating"})][0]
    data.append([link, title, price])

print(data)
df = pd.DataFrame(data, columns=header)
df.to_csv("products.csv")
I'm trying to use BeautifulSoup4 in Orange to scrape data from a list of URLs scraped from that same website.
I have managed to scrape the data from a single page when I set the URL manually.
from bs4 import BeautifulSoup
import requests

url = "https://data.ushja.org/awards-standings/zone-points.aspx?year=2021&zone=1&section=1901"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
for child in rank.children:
    print(url, child)
and I have been able to scrape the list of URLs I need
from bs4 import BeautifulSoup
import requests

url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
link = soup.find('div', class_='contentSection')
url_list = link.find('a').get('href')
for url_list in link.find_all('a'):
    print(url_list.get('href'))
But so far I haven't been able to combine the two to scrape the data from every URL in that list. Can I do that just by nesting for loops, and if so, how? Or how else can I do it?
I am sorry if this is a stupid question, but I only started with Python and web scraping yesterday, and I have not been able to figure this out by consulting similar-ish topics.
Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")

# get all links
# (the parser turns the "&sect" in "&section=" into the "§" character,
# so put the literal "&sect" back before requesting the URL)
url_list = []
for a in soup.find("div", class_="contentSection").find_all("a"):
    url_list.append(a["href"].replace("§", "&sect"))

# get all data from URLs
all_data = []
for url in url_list:
    print(url)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "html.parser")
    h2 = soup.h2
    sub = h2.find_next("p")
    for tr in soup.select("tr:has(td)"):
        all_data.append(
            [
                h2.get_text(strip=True),
                sub.get_text(strip=True),
                *[td.get_text(strip=True) for td in tr.select("td")],
            ]
        )

# save data to CSV
df = pd.DataFrame(
    all_data,
    columns=[
        "title",
        "sub_title",
        "Rank",
        "Horse / Owner",
        "Points",
        "Total Comps",
    ],
)
print(df)
df.to_csv("data.csv", index=None)
This traverses all URLs and saves all the data to data.csv.
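If a zone has many sections, reusing a single requests.Session keeps the connection alive across the loop; a minimal sketch of the fetch step:

# One session for all section URLs instead of a new connection each time
session = requests.Session()
for url in url_list:
    req = session.get(url)
    soup = BeautifulSoup(req.text, "html.parser")
    # ... parse as above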
User Chrisvdberge helped me create the following code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url_DAX = 'https://www.eurexchange.com/exchange-en/market-data/statistics/market-statistics-online/100!onlineStats?viewType=4&productGroupId=13394&productId=34642&cp=&month=&year=&busDate=20191114'
req = requests.get(url_DAX, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_DAX.csv')
print(df)
url_DOW = 'https://www.cmegroup.com/trading/equity-index/us-index/e-mini-dow_quotes_settlements_futures.html'
req = requests.get(url_DOW, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_DOW.csv')
print(df)
url_NASDAQ = 'https://www.cmegroup.com/trading/equity-index/us-index/e-mini-nasdaq-100_quotes_settlements_futures.html'
req = requests.get(url_NASDAQ, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_NASDAQ.csv')
print(df)
url_CAC = 'https://live.euronext.com/fr/product/index-futures/FCE-DPAR/settlement-prices'
req = requests.get(url_CAC, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_CAC.csv')
print(df)
I have the following result:
3 .csv files are created: results_DAX.csv (here everything is OK, I have the values I want), results_DOW.csv, and results_NASDAQ.csv (here the problem is that these .csv files don't contain the wanted values; I don't understand why).
As you can see in the code, 4 files should be created, not only 3.
So my questions are:
How do I get 4 csv files?
How do I get values into the results_DOW.csv and results_NASDAQ.csv files? (and maybe also into the results_CAC.csv file)
Thank you for your answers! :)
Try this to get the data from those other sites. The last site is a little trickier, so you'd need to try out Selenium:
import pandas as pd
import requests
from datetime import date, timedelta

url_DAX = 'https://www.eurexchange.com/exchange-en/market-data/statistics/market-statistics-online/100!onlineStats?viewType=4&productGroupId=13394&productId=34642&cp=&month=&year=&busDate=20191114'
df = pd.read_html(url_DAX)[0]
df.to_csv('results_DAX.csv')
print(df)

# The CME pages load their settlement tables from a JSON endpoint,
# so query it directly with the trade date as a parameter
dt = date.today() - timedelta(days=2)
dateParam = dt.strftime('%m/%d/%Y')

url_DOW = 'https://www.cmegroup.com/CmeWS/mvc/Settlements/Futures/Settlements/318/FUT'
payload = {
    'tradeDate': dateParam,
    'strategy': 'DEFAULT',
    'pageSize': '500',
    '_': '1573920502874'}

response = requests.get(url_DOW, params=payload).json()
df = pd.DataFrame(response['settlements'])
df.to_csv('results_DOW.csv')
print(df)

url_NASDAQ = 'https://www.cmegroup.com/CmeWS/mvc/Settlements/Futures/Settlements/146/FUT'
payload = {
    'tradeDate': dateParam,
    'strategy': 'DEFAULT',
    'pageSize': '500',
    '_': '1573920650587'}

response = requests.get(url_NASDAQ, params=payload).json()
df = pd.DataFrame(response['settlements'])
df.to_csv('results_NASDAQ.csv')
print(df)
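For the CAC (Euronext) page, a rough Selenium sketch along the lines suggested above (untested; it assumes the settlement prices land in the first <table> once the JavaScript has run):

from selenium import webdriver
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get('https://live.euronext.com/fr/product/index-futures/FCE-DPAR/settlement-prices')
time.sleep(5)  # crude wait for the JavaScript-rendered table
df = pd.read_html(driver.page_source)[0]  # assumes the first table is the settlement table
driver.quit()
df.to_csv('results_CAC.csv')
print(df)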
I tested my code in a Jupyter notebook with this code
...
rname = soup.find('p', 'con_tx')
#rnamelis = rname.findAll('p')
rname
from urllib.request import urljoin
story=[]
#review_text = lis[0].find('p').getText()
#list_soup =soup.find_all('p', 'con_tx')
story=rname.getText()
story
and it worked well.
(result) '전 여친에 ...'
But when I tried to scrape multiple pages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import requests

base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages = ['177374', '164102']
url = base_url + pages[0]

story = []
for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    #print(soup.find('p', 'con_tx'))
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    data = {story}
    df = pd.DataFrame(data)
    df.head()

df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
An error message came out:
ValueError: DataFrame constructor not properly called!
How do I fix my code?
Not sure what you are trying to do, but one thing I'm noticing is that you are overwriting your dataframe each time. Also, I don't know why you initialise story as a list and then store it as a set in the loop.
from bs4 import BeautifulSoup
import pandas as pd
import requests

base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages = ['177374', '164102']

df = pd.DataFrame()
for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    data = [story]
    df = df.append(pd.DataFrame(data), sort=True).reset_index(drop=True)

df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
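Note that DataFrame.append was removed in pandas 2.0; a sketch of the same loop that collects the rows in a plain list instead:

# Collect the texts in a list, then build the frame once at the end
rows = []
for n in pages:
    res = requests.get(base_url + n)
    soup = BeautifulSoup(res.text, "html.parser")
    rows.append(soup.find('p', 'con_tx').getText())

df = pd.DataFrame({'story': rows})
df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')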
I would like to get the value 100 from the tag below using Python and Beautiful Soup.
<span style="font-size:90%"><b>100</b> <cite style="color:#cc0000"><b>-0.10</b> (0.52%)</cite></span>
The code below gives me the following output
100 -0.10 (0.52%)
How can I extract only the value 100?
Code:
from urllib.request import Request, urlopen
import bs4
import re
url = 'url.com'
req = Request(url, headers = {'User-Agent': 'Mozilla/5.0'})
page = urlopen(req).read()
soup = bs4.BeautifulSoup(page, 'html.parser')
data = soup.find('span',style=re.compile('font-size:90%'))
value = data.text
You can take the first element of the found tag's .contents:
from bs4 import BeautifulSoup as soup
d = soup(page, 'html.parser').find('span', {'style':'font-size:90%'}).contents[0].text
Output:
'100'
Just find the <b> tag; it will give you 100.
data = soup.find('span',style=re.compile('font-size:90%'))
value = data.find('b').text
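Equivalently, BeautifulSoup lets you reach child tags as attributes, so the lookup fits on one line:

value = soup.find('span', style=re.compile('font-size:90%')).b.text  # '100'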