How to scrape address (comma separated text) using Beautifulsoup in python - python

I am trying to scrape address from the below link:
https://www.yelp.com/biz/rollin-phatties-houston
But I am getting only the first value of the address (i.e.: 1731 Westheimer Rd) out of complete address which is separated by a comma:
1731 Westheimer Rd, Houston, TX 77098
Can anyone help me out in this, please find my code below:
# Scrape the street address from a Yelp business page.
import bs4 as bs
import urllib.request as url

source = url.urlopen('https://www.yelp.com/biz/rollin-phatties-houston')
soup = bs.BeautifulSoup(source, 'html.parser')

# Containers holding the business's secondary attributes (address, phone, ...).
mains = soup.find_all("div", {"class": "secondaryAttributes__09f24__3db5x arrange-unit__09f24__1gZC1 border-color--default__09f24__R1nRO"})
main = mains[0] #First item of mains
address = []
for main in mains:
    try:
        # Only the FIRST <p> inside <address> is read -- city/state/zip live in
        # sibling <p> tags, which is why just "1731 Westheimer Rd" comes back.
        address.append(main.address.find("p").text)
    except AttributeError:
        # Narrowed from a bare `except:`: only a missing <address>/<p> should
        # fall through to the empty placeholder, not e.g. KeyboardInterrupt.
        address.append("")
print(address)
# 1731 Westheimer Rd

import requests
import re
from ast import literal_eval

def main(url):
    """Fetch *url* and print the address lines embedded in its raw HTML."""
    response = requests.get(url)
    # The page source contains a JS literal like: "addressLines": ["...", "..."]
    # Capture the bracketed list and evaluate it into a Python list.
    raw_list = re.search(r'addressLines.+?(\[.+?])', response.text).group(1)
    print(*literal_eval(raw_list))

main('https://www.yelp.com/biz/rollin-phatties-houston')
Output:
1731 Westheimer Rd Houston, TX 77098

There is no need to find the address information by inspecting the rendered elements; the data is already embedded in a JavaScript <script> tag that is shipped with the page. You can get it with the following code:
# Extract the business address from the JS state object embedded in the page.
import chompjs
import bs4 as bs
import urllib.request as url

source = url.urlopen('https://www.yelp.com/biz/rollin-phatties-houston')
soup = bs.BeautifulSoup(source, 'html.parser')
# The 17th <script> tag holds the page-state JS object.
# NOTE(review): this positional index is brittle and will break if Yelp
# reorders its script tags -- confirm before relying on it.
javascript = soup.select("script")[16].string
data = chompjs.parse_js_object(javascript)
# print() added: as a bare expression the address is only echoed in a REPL,
# not when this file is run as a script.
print(data['bizDetailsPageProps']['bizContactInfoProps']['businessAddress'])

The business address that is shown on the webpage is generated dynamically. If you view Page Source of the URL, you will find that the address of the restaurant is stored in a script element. So you need to extract the address from it.
from bs4 import BeautifulSoup
import requests
import json

# The address is rendered client-side, so read it from the JSON payload the
# server embeds in a <script type="application/json"> element instead.
page = requests.get('https://www.yelp.com/biz/rollin-phatties-houston')
htmlpage = BeautifulSoup(page.text, 'html.parser')
scriptelements = htmlpage.find_all('script', attrs={'type':'application/json'})

# The third such script holds the business-details state, wrapped in HTML
# comment markers that must be stripped before json.loads() will accept it.
raw = scriptelements[2].text
for marker in ('<!--', '-->'):
    raw = raw.replace(marker, '')

jsondata = json.loads(raw)
print(jsondata['bizDetailsPageProps']['bizContactInfoProps']['businessAddress'])
Using the above code, you will be able to extract the address of any business.

Related

lxml to grab All items that share a certain xpath

I'm trying to grab all prices from a website using XPath. All prices share the same XPath pattern, and only [0] — that is, the 1st item — works... let me show you:
# Excerpt -- URL and HEADERS are defined in the full listing further below.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
# Re-serialize the soup so lxml can build an XPath-capable tree from it.
dom = etree.HTML(str(soup))
# The `li[1]` step pins the FIRST list item, so this absolute XPath can only
# ever match one node -- that is why only the first price prints.
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)
This successfully prints the 1st price!!!
I tried changing "[0]" to "[1]" to print the 2nd item, but it returned "out of range".
Then I was trying to think of some For loop that would print All Items, so I could create an average.
Any help would be Greatly appreciated!!!
I apologize edited in is the code
from bs4 import BeautifulSoup
from lxml import etree
import requests
# Newegg search-results page for GPUs.
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
#HEADERS = you'll need to add your own headers here, won't let post.
# NOTE(review): HEADERS must be defined (e.g. a dict with a User-Agent) before
# the next line runs, otherwise this raises NameError.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
# Re-serialize the soup so lxml can build an XPath-capable tree from it.
dom = etree.HTML(str(soup))
# Absolute XPath down to one <strong> price node; [0] takes the first match.
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)
You could just use css selectors which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")

# Map item title -> displayed price, one entry per product card.
prices = {}
for card in soup.select('.item-container'):
    # Remove the extra ".price-current-num" node (same truthiness test as the
    # original walrus form) so only the wanted price text remains.
    extra = card.select_one('.price-current-num')
    if extra:
        extra.decompose()
    title = card.select_one('.item-title').text
    # [:-1] trims the trailing character of the price string.
    prices[title] = card.select_one('.price-current').get_text(strip=True)[:-1]
pprint(prices)
prices as list of floats
# Same scrape, but collect the prices as a list of floats.
import requests, re
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = []
for i in soup.select('.item-container'):
    # Drop the extra ".price-current-num" node so only the price text remains.
    if a := i.select_one('.price-current-num'):
        a.decompose()
    # Raw string for the regex: '\$' in a plain literal is an invalid escape
    # sequence (a SyntaxWarning on modern Python). Strip '$' and ',' then
    # convert; [:-1] trims the trailing character first.
    prices.append(float(re.sub(r'\$|,', '', i.select_one('.price-current').get_text(strip=True)[:-1])))
pprint(prices)

Using multiple for loop with Python Using Beautiful Soup

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

# Property24 listing page for a single property in Bloemfontein.
url = (
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate"
    "/bloemfontein/free-state/10467/109825373"
)
data = requests.get(url)
# Parse the raw response body with the stdlib HTML parser.
soup = bs(data.content, "html.parser")
The code below is a test to get one item.
# Walk the nesting one class at a time instead of a single chained lookup.
listing = soup.find(class_="p24_regularListing")
overview = listing.find(class_="p24_propertyOverview")
row = overview.find(class_='p24_propertyOverviewRow')
property_overview = row.find(class_='col-xs-6 p24_propertyOverviewKey').text
property_overview  # bare expression: only echoes in a REPL/notebook
Output : 'Listing Number'
The code below is what we have to get all the col-xs-6 p24_propertyOverviewKey
p24_regularListing_items = soup.find_all(class_="p24_regularListing")
# NOTE(review): the indentation of the nested loop bodies below was lost when
# this code was pasted; as shown it is not valid Python.
for p24_propertyOverview_item in p24_regularListing_items:
p24_propertyOverview_items = p24_propertyOverview_item.find_all(class_="p24_propertyOverview")
for p24_propertyOverviewRow_item in p24_propertyOverview_items:
p24_propertyOverviewRow_items = p24_propertyOverviewRow_item.find_all(class_="p24_propertyOverviewRow")
for p24_propertyOverviewKey_item in p24_propertyOverviewRow_items:
p24_propertyOverviewKey_items = p24_propertyOverviewKey_item.find_all(class_="col-xs-6 p24_propertyOverviewKey")
# NOTE(review): this name is reassigned on every loop pass and the bare
# expression below only echoes in a REPL, so at most one result is ever shown
# -- that is the "only outputs 1 item" behavior being asked about.
p24_propertyOverviewKey_items
The code above only outputs one item, not all of them.
To put things more simply, you can use soup.select() (and via the comments, you can then use .get_text() to extract the text from each tag).
from bs4 import BeautifulSoup
import requests

resp = requests.get(
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate"
    "/bloemfontein/free-state/10467/109825373"
)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")

# One CSS descendant selector replaces the four nested find_all() loops.
# NB: built from adjacent string literals (implicit concatenation).
selector = (
    ".p24_regularListing "
    ".p24_propertyOverview "
    ".p24_propertyOverviewRow "
    ".p24_propertyOverviewKey"
)
texts = [tag.get_text() for tag in soup.select(selector)]
print(texts)

BeautifulSoup: Reading Span Class Elements

I am having some issues web scraping information from a particular page's span class element, using the BeautifulSoup and requests add-ons in Python. It keeps returning blank information: " ". Here's my code:
# Attempt to read the current condition text from The Weather Network.
headers = {'User-Agent':'Mozilla/5.0'}  # defined but never passed to the request
res = requests.get('https://www.theweathernetwork.com/ca/weather/ontario/toronto')
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
# The span exists in the static HTML but its text is filled in client-side,
# which is why it prints empty (see the answer below this code).
weather_elem = soup.find('span', {'class':'wxcondition'})
weather = weather_elem
print(weather)
# NOTE(review): the original snippet ended with `return weather` followed by a
# stray backtick; `return` is only legal inside a function and the backtick is
# a syntax error, so both are removed here.
The data is loaded through JavaScript so BeautifulSoup doesn't see anything. But you can simulate the Ajax with the requests module:
import json
import requests
from bs4 import BeautifulSoup

url = 'https://www.theweathernetwork.com/ca/weather/ontario/toronto'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# NOTE(review): assumes the alternate link's href ends in "=<placecode>" --
# the last '='-separated chunk, lowercased, keys the observation API.
alternate_link = soup.select_one('link[rel="alternate"]')
place_code = alternate_link['href'].split('=')[-1].lower()

ajax_url = 'https://weatherapi.pelmorex.com/api/v1/observation/placecode/' + place_code
data = requests.get(ajax_url).json()
# uncomment to print all data:
# print(json.dumps(data, indent=4))
print(data['observation']['weatherCode']['text'])
Prints:
Partly cloudy

BS4 + html, b Tag issue

This question is about web scraping with bs4
this is the code I have written:
import requests
from bs4 import BeautifulSoup
import json
import csv

# Product page whose review score (the "4.8") is wanted.
product_url = ('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-'
               'LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
page = requests.get(product_url)
soup = BeautifulSoup(page.text, 'html.parser')  # build the parse tree
# Grabs the whole <a class="score-lite"> element; its <b> child holds the
# numeric score, so printing this shows the full anchor markup.
stars = soup.select_one('a[class="score-lite"]', namespaces=None, flags=0)
print('Stars', stars)
My outcome:
<a class="score-lite" data-spm-click="gostr=/details.index.reviewLevel;locaid=dreviewLevel" href="https://onuliss.en.alibaba.com/company_profile/feedback.html" target="_blank"><b>4.8 </b><img src="//img.alicdn.com/tfs/TB1MJPmiQL0gK0jSZFtXXXQCXXa-8-9.svg"/></a>
The outcome I want is just the 4.8 number between the 'b' tags
What do I have to do with the = soup.select_one() function?
Thank you very much :)
Try with a more specific selector, the string property of the match and strip() to get rid of eventual extra spaces.
import requests
from bs4 import BeautifulSoup
import json
import csv

page = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
soup = BeautifulSoup(page.text, 'html.parser')

# Target the <b> child directly so only the numeric score is matched, then
# strip surrounding whitespace from its text.
score_tag = soup.select_one('a[class="score-lite"] > b', namespaces=None, flags=0)
stars = score_tag.get_text(strip=True)
print('Stars', stars)
Stars 4.8
how about SimplifiedDoc
import requests
from simplified_scrapy.simplified_doc import SimplifiedDoc

resp = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
# Wrap the raw HTML in a SimplifiedDoc for tag/class lookups.
doc = SimplifiedDoc(resp.text)
# Locate the score anchor by tag name and class attribute value.
stars = doc.getElement('a','class',"score-lite")
print('Stars', stars.text, stars.b.text) # Stars 4.8 4.8
import requests
from bs4 import BeautifulSoup

r = requests.get(
    'https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
# Parse only when the request succeeded: the original built the soup BEFORE
# checking the status, and silently did nothing on a non-200 response.
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')
    # The numeric score sits in a <b> inside the "score-lite" anchor.
    item = soup.find('a', {'class': 'score-lite'}).find('b')
    print(item.get_text(strip=True))
output:
4.8

How do I scrape IP addresses through the use of BeautifulSoup and output to CSV?

import requests
from bs4 import BeautifulSoup
url ='https://myip.ms/browse/blacklist/Blacklist_IP_Blacklist_IP_Addresses_Live_Database_Real-time'
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
# NOTE(review): find() returns only the FIRST matching <td>, not the table.
ipList = soup.find("td",{"class": "row_name"})
# NOTE(review): a single row_name cell has no nested <td> children, so this
# yields an empty result.
rows = ipList.findAll('td')
# NOTE(review): the loop bodies below lost their indentation in the paste, and
# both `td` and `ip` are undefined names (NameError if this code ever ran).
for tr in rows:
cols = td.findAll('td')
if len(cols) > 0:
print (ip.cols.text.strip())
I am doing web scraping using BeautifulSoup and I have encountered some problems. May I know why I am unable to scrape IP addresses from the database table? How do I output the results to a CSV file?
The problem is that you are using find() for ipList, which fetches only the first matching cell; use find_all() or select() instead, which return an array of matching elements.
import requests
from bs4 import BeautifulSoup

url ='https://myip.ms/browse/blacklist/Blacklist_IP_Blacklist_IP_Addresses_Live_Database_Real-time'
response = requests.get(url).content
soup = BeautifulSoup(response, 'html.parser')

# select() returns EVERY element with class "row_name" (find() would stop at
# the first match), one per table row.
ipList = soup.select(".row_name")
with open('ip_output.csv', 'w') as f:
    for ips in ipList:
        link = ips.find('a')
        # Guard added: a name cell without an <a> (e.g. a header row) would
        # otherwise raise AttributeError on `.text`.
        if link is not None:
            f.write(link.text + '\n')
Output in csv
195.154.251.86
37.140.192.194
80.94.174.55
175.103.39.28
90.173.129.250
51.15.146.121
...
...

Categories