I want to scrape the longitude and the latitude of this restaurant - python

I have tried every combination I can think of, but it always returns NoneType and never the longitude and latitude. I'm about to give up.
import requests
from bs4 import BeautifulSoup

url = "https://www.google.com/maps/place/?q=place_id:ChIJPVsC-y9HWBQRzjV7If63wzw"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
a = soup.find("div", class_="Z0LcW").text
print(a)
AttributeError: 'NoneType' object has no attribute 'text'
Or this:
import requests
from bs4 import BeautifulSoup as soup
import re

url = "https://www.google.com/maps/place/?q=place_id:ChIJPVsC-y9HWBQRzjV7If63wzw"
resp = requests.request(method="GET", url=url)
soup_parser = soup(resp.text, "html.parser")
html_content = soup_parser.html.contents[1]
_script = html_content.find_all("script")[7]
matches = re.findall(r"(-\d+\.\d{7})", _script.text)
print(matches[0], matches[1])
which returns this:
IndexError Traceback (most recent call last)
Cell In[54], line 15
12 _script = html_content.find_all("script")[7]
14 matches=re.findall("(-\d+\.\d{7})",_script.text)
---> 15 print(matches[0],matches[1])
IndexError: list index out of range
Edit: I have an API key for the Google Maps API. If you know how to geocode the place_id returned by the request, that'll work too!
The problem with the request made by places_nearby() is that the location returned is the location of the district; I want the location of the restaurant!
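Since you already have an API key, the simplest route is the Geocoding API, which accepts a place_id directly and returns the coordinates of the place itself rather than the district. A minimal sketch, assuming the standard Geocoding endpoint and with YOUR_API_KEY as a placeholder:
import requests

params = {
    "place_id": "ChIJPVsC-y9HWBQRzjV7If63wzw",
    "key": "YOUR_API_KEY",  # placeholder: your Google Maps API key
}
r = requests.get("https://maps.googleapis.com/maps/api/geocode/json", params=params)
location = r.json()["results"][0]["geometry"]["location"]
print(location["lat"], location["lng"])  # coordinates of the restaurant itself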

Related

How to get attribute from element using beautifulsoup?

Here's a bit of HTML from a web page:
<bg-quote class="value negative" field="Last" format="0,0.00" channel="/zigman2/quotes/203558040/composite,/zigman2/quotes/203558040/lastsale" data-last-stamp="1624625999626" data-last-raw="671.68">671.68</bg-quote>
I want to get the value of the attribute "data-last-raw", but the find() method seems to return None when searching for this element. Why is this, and how can I fix it?
My code and Traceback below:
import requests
from bs4 import BeautifulSoup as BS
import tkinter as tk

class Scraping:

    @classmethod
    def get_to_site(cls, stock_name):
        sitename = 'https://www.marketwatch.com/investing/stock/' + stock_name
        site = requests.get(sitename, headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
            "Connection": "keep-alive",
            "Host": "www.marketwatch.com",
            "Referer": "https://www.marketwatch.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36"
        })
        print(site.status_code)
        src = site.content
        Scraping.get_price(src)

    @classmethod
    def get_price(cls, src):
        soup = BS(src, "html.parser")
        price_holder = soup.find("bg-quote", {"channel": "/zigman2/quotes/203558040/composite,/zigman2/quotes/203558040/lastsale"})
        price = price_holder["data-last-raw"]
        print(price)

Scraping.get_to_site('tsla')
200
Traceback (most recent call last):
File "c:\Users\Aatu\Documents\python\pythonleikit\stock_price_scraper.py", line 41, in <module>
Scraping.get_to_site('tsla')
File "c:\Users\Aatu\Documents\python\pythonleikit\stock_price_scraper.py", line 30, in get_to_site
Scraping.get_price(src)
File "c:\Users\Aatu\Documents\python\pythonleikit\stock_price_scraper.py", line 36, in get_price
price = price_holder["data-last-raw"]
TypeError: 'NoneType' object is not subscriptable
So site.status_code returns 200 to indicate that the site opened correctly, but I think the soup.find() method returns None because the element I was looking for was not found.
Somebody please help!
import requests
from bs4 import BeautifulSoup

def main(ticker):
    r = requests.get(f'https://www.marketwatch.com/investing/stock/{ticker}')
    soup = BeautifulSoup(r.text, 'lxml')
    print(soup.select_one('bg-quote.value:nth-child(2)').text)

if __name__ == "__main__":
    main('tsla')
Output:
670.99
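Note that the question asked for the data-last-raw attribute rather than the element text; once a tag is matched, attributes can be read with indexing or .get(). A minimal sketch, assuming the page still serves a bg-quote element carrying a field="Last" attribute:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.marketwatch.com/investing/stock/tsla',
                 headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(r.text, 'html.parser')
quote = soup.find('bg-quote', {'field': 'Last'})  # match on a stable attribute
if quote is not None:  # guard against layout changes returning None
    print(quote.get('data-last-raw'))  # e.g. 671.68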

python stripping text and definition of names

I am scraping car names and prices from a car site, to be appended into a table that can be saved to an Excel file. I need help stripping out everything except the car name details, and overcoming this 'names not defined' problem. The code follows:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
base_url = "https://www.carlist.my/used-cars-for-sale/malaysia"
response = get(base_url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
print(html_soup)
content_list = html_soup.find_all('div', attrs={'class': 'grid__item'})
print(content_list)
basic_info = []
for item in content_list:
    basic_info.append(item.find_all('a', attrs={'class': 'ellipsize js-ellipsize-text'}))
print(basic_info)

def get_names(basic_info):
    names = []
    for item in basic_info:
        for i in item:
            names.append(i.find_all('a', attrs={'class': '"ellipsize js-ellipsize-text'})[0].text.strip())
    return names

data = pd.DataFrame({'Name': names})[['Name']]
data.head()
data.drop_duplicates().to_excel('Car_list.xls')
NameError Traceback (most recent call last)
<ipython-input-15-e2eba5476dff> in <module>
6 return names
7
----> 8 data = pd.DataFrame({'Name' : names})[['Name']]
9 data.head()
10 data.drop_duplicates().to_excel('Car_list.xls')
NameError: name 'names' is not defined
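The NameError happens because names exists only inside get_names: the function is defined but never called, so nothing at module level is bound to that name. There is also a stray quote in the class string inside the loop, and each i is already an <a> tag, so no second find_all is needed. A minimal sketch of the fix, reusing the basic_info list built above:
def get_names(basic_info):
    names = []
    for item in basic_info:
        for i in item:
            names.append(i.text.strip())  # i is already the <a> tag
    return names

names = get_names(basic_info)  # bind the result at module level
data = pd.DataFrame({'Name': names})
data.drop_duplicates().to_excel('Car_list.xls')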

Python web scrape numerical weather data

I am attempting to print the int value of the current outside air temperature (55).
Any tips on what I am doing wrong? (Sorry, not a lot of wisdom here!)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime as dt

# this is used at the end with plotting results to current hour
h = dt.datetime.now().hour
r = requests.get('https://www.google.com/search?q=weather+duluth')
soup = BeautifulSoup(r.text, 'html.parser')
stuff = []
for item in soup.select('vk_bk sol-tmp'):
    item = int(item.contents[1].get_text(strip=True)[:-1])
    # print(item)  # this is weather data
    stuff.append(item)
This is the weather URL, and the current outdoor temperature is tied to a specific div class (a screenshot was attached in the original post).
If I attempt to print stuff, I just get an empty list back.
Adding a User-Agent header should give the expected result:
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
r = requests.get('https://www.google.com/search?q=weather%20duluth', headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
soup.find("span", {"class": "wob_t"}).text
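To get the integer the question asks for, the matched span's text is just the number, so it can be cast directly (assuming wob_t is still the class Google uses for the temperature):
temp = int(soup.find("span", {"class": "wob_t"}).text)  # e.g. 55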

Web scraping twitter

I want to scrape a Twitter search page to download tweets for a specific search word. I can only fetch 20 tweets rather than all of them. Please help me fetch all the tweets recursively. Below is the code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

company_name = 'ABC'
url = 'https://twitter.com/search?q=%23%27%20%20%20' + company_name + '&src=typd&lang=en'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
req = requests.get(url, headers=headers)  # print(req)
data = req.text  # print(data)
# soup = BeautifulSoup(data, "lxml")  # print(soup)
soup = BeautifulSoup(data, "html.parser")  # print(soup)
tweets = [p.text for p in soup.findAll('p', class_='tweet-text')]
# print(tweets)
df = pd.DataFrame()
df['Tweet'] = tweets
print(df.head())
print(df.shape)
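With plain requests you only receive the first server-rendered page, which is why the count stops at 20: the remaining tweets are loaded by JavaScript as the page scrolls, and that JavaScript never runs here. One possible alternative is the third-party snscrape package, which pages through search results for you; a minimal sketch, noting that the tweet attribute names are assumptions and vary between snscrape versions:
import snscrape.modules.twitter as sntwitter
import pandas as pd

company_name = 'ABC'
tweets = []
# TwitterSearchScraper handles pagination itself; cap the count manually
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(f'#{company_name}').get_items()):
    if i >= 500:  # stop after 500 tweets instead of the first page's 20
        break
    tweets.append(tweet.content)  # tweet text; 'rawContent' in newer versions

df = pd.DataFrame({'Tweet': tweets})
print(df.shape)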

Beautifulsoup parsing error

I am trying to extract some information about an App on Google Play and BeautifulSoup doesn't seem to work.
The link is this (say):
https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts
My code:
import requests
from bs4 import BeautifulSoup

url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html, "html.parser")
l = soup.find_all("div", { "class" : "document-subtitles"})
print len(l)
0 #How is this 0?! There is clearly a div with that class
I decided to go all in; that didn't work either:
i = soup.select('html body.no-focus-outline.sidebar-visible.user-has-no-subscription div#wrapper.wrapper.wrapper-with-footer div#body-content.body-content div.outer-container div.inner-container div.main-content div div.details-wrapper.apps.square-cover.id-track-partial-impression.id-deep-link-item div.details-info div.info-container div.info-box-top')
print i
What am I doing wrong?
You need to pretend to be a real browser by supplying the User-Agent header:
import requests
from bs4 import BeautifulSoup
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url, headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})
html = r.content
soup = BeautifulSoup(html, "html.parser")
title = soup.find(class_="id-app-title").get_text()
rating = soup.select_one(".document-subtitle .star-rating-non-editable-container")["aria-label"].strip()
print(title)
print(rating)
Prints the title and the current rating:
Weird Facts
Rated 4.3 stars out of five stars
To get the additional information field values, you can use the following generic function:
def get_info(soup, text):
    return soup.find("div", class_="title", text=lambda t: t and t.strip() == text) \
        .find_next_sibling("div", class_="content").get_text(strip=True)
Then, if you do:
print(get_info(soup, "Size"))
print(get_info(soup, "Developer"))
You will see printed:
1.4M
Email email#here.com
