python stripping text and definition of names

python stripping text and definition of names - python

I am scraping car name and car price from a car site to subsequently append into a table which could be saved to an excel file. I need help to strip all codes except for the car name details, and to overcome this 'names not defined' problem.The following are the codes
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
headers = ({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit\
/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})
base_url = "https://www.carlist.my/used-cars-for-sale/malaysia"
response = get(base_url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
print(html_soup)
html_soup = BeautifulSoup(response.text, 'html.parser')
content_list = html_soup.find_all('div', attrs={'class': 'grid__item'})
print(content_list)
basic_info = []
for item in content_list:
basic_info.append(item.find_all('a', attrs={'class': 'ellipsize js-ellipsize-text'}))
print(basic_info)
def get_names(basic_info):
names = []
for item in basic_info:
for i in item:
names.append(i.find_all('a', attrs = {'class' : '"ellipsize js-ellipsize-text'})
[0].text.strip())
return names
data = pd.DataFrame({'Name' : names})[['Name']]
data.head()
data.drop_duplicates().to_excel('Car_list.xls')
NameError Traceback (most recent call last)
<ipython-input-15-e2eba5476dff> in <module>
6 return names
7
----> 8 data = pd.DataFrame({'Name' : names})[['Name']]
9 data.head()
10 data.drop_duplicates().to_excel('Car_list.xls')
NameError: name 'names' is not defined

Related

I want to scrape the longitude and the latitude of this restaurant

I tried all kinds of combinations I can think of but it always returns nonetype and never the longitude and latitude, I just give up.
import requests
from bs4 import BeautifulSoup as soup
import re
url = "https://www.google.com/maps/place/?q=place_id:ChIJPVsC-y9HWBQRzjV7If63wzw"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',}
response = requests.get(url,headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
a = soup.find("div", class_= "Z0LcW").text
print(a)
AttributeError: 'NoneType' object has no attribute 'text'
or this:
import requests
from bs4 import BeautifulSoup as soup
import re
url = "https://www.google.com/maps/place/?q=place_id:ChIJPVsC-y9HWBQRzjV7If63wzw"
resp=requests.request(method="GET",url=url)
soup_parser = soup(resp.text, "html.parser")
html_content = soup_parser.html.contents[1]
_script = html_content.find_all("script")[7]
matches=re.findall("(-\d+\.\d{7})",_script.text)
print(matches[0],matches[1])
returns this
IndexError Traceback (most recent call last)
Cell In[54], line 15
12 _script = html_content.find_all("script")[7]
14 matches=re.findall("(-\d+\.\d{7})",_script.text)
---> 15 print(matches[0],matches[1])
IndexError: list index out of range
Edit: I have an API KEY for googlemaps api, If you know how to geocode the place_id returned by the request, That'll work too!
The problem with the request made by places_nearby() is the location returned is the location of the district, I want the location of the restaurant!

Why am I getting Attribute error: nonetype object has no attribute get_text whenever I try to scrape this ecommerce store

I'm trying to scrape an ecommerce store but getting Attribute error: nonetype object has no attribute get_text. This happens whenever i try to iterate between each products through the product link. I'm confused if am running into a javascript or captcha or whatnot don't know. Here's my code
import requests
from bs4 import BeautifulSoup
baseurl = 'https://www.jumia.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
productlinks = []
for x in range(1,51):
r = requests.get(f'https://www.jumia.com.ng/ios-phones/?page={x}#catalog-listing/')
soup = BeautifulSoup(r.content, 'lxml')
productlist = soup.find_all('article', class_='prd _fb col c-prd')
for product in productlist:
for link in product.find_all('a', href=True):
productlinks.append(baseurl + link['href'])
for link in productlinks:
r = requests.get(link, headers = headers)
soup = BeautifulSoup(r.content, 'lxml')
name = soup.find('h1', class_='-fs20 -pts -pbxs').get_text(strip=True)
amount = soup.find('span', class_='-b -ltr -tal -fs24').get_text(strip=True)
review = soup.find('div', class_='stars _s _al').get_text(strip=True)
rating = soup.find('a', class_='-plxs _more').get_text(strip=True)
features = soup.find_all('li', attrs={'style': 'box-sizing: border-box; padding: 0px; margin: 0px;'})
a = features[0].get_text(strip=True)
b = features[1].get_text(strip=True)
c = features[2].get_text(strip=True)
d = features[3].get_text(strip=True)
e = features[4].get_text(strip=True)
f = features[5].get_text(strip=True)
print(f"Name: {name}")
print(f"Amount: {amount}")
print(f"Review: {review}")
print(f"Rating: {rating}")
print('Key Features')
print(f"a: {a}")
print(f"b: {b}")
print(f"c: {c}")
print(f"d: {d}")
print(f"e: {e}")
print(f"f: {f}")
print('')
Here's the error message:
Traceback (most recent call last):
File "c:\Users\LP\Documents\jumia\jumia.py", line 32, in <module>
name = soup.find('h1', class_='-fs20 -pts -pbxs').get_text(strip=True)
AttributeError: 'NoneType' object has no attribute 'get_text'
PS C:\Users\LP\Documents\jumia> here

Change the variable baseurl to https://www.jumia.com.ng and change the features variable to features = soup.find('article', class_='col8 -pvs').find_all('li'). After fixing those two issues, you'll probably get an IndexError because not every page has six features listed. You can use something like the following code to iterate through the features and print them:
for i, feature in enumerate(features):
print(chr(ord("a")+i) + ":", feature.get_text(strip=True))
With this for loop, you don't need the a to f variables. The chr(ord("a")+i part gets the letter corresponding to index i. However, if there are more than 26 features this will print punctuation characters or garbage. This can be trivially fixed by breaking the loop when i>25. This trick won't work on EBCDIC systems, only ASCII ones.
Even after making these three changes, there was an AttributeError when it tried to scrape a link to a product unrelated to iPhones, which showed up on page 5 of the results. I don't know how the script got that link; it was a medicinal cream. To fix that, either wrap the body of the second for loop in a try except like the following or put the last line of the first for loop under a if 'iphone' in link.
for link in productlinks:
try:
# body of for loop goes here
except AttributeError:
continue
With these changes, the script would look like this:
import requests
from bs4 import BeautifulSoup
baseurl = 'https://www.jumia.com.ng'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
productlinks = []
for x in range(1,51):
r = requests.get(f'https://www.jumia.com.ng/ios-phones/?page={x}#catalog-listing/')
soup = BeautifulSoup(r.content, 'lxml')
productlist = soup.find_all('article', class_='prd _fb col c-prd')
for product in productlist:
for link in product.find_all('a', href=True):
if 'iphone' in link['href']:
productlinks.append(baseurl + link['href'])
for link in productlinks:
r = requests.get(link, headers = headers)
soup = BeautifulSoup(r.content, 'lxml')
try:
name = soup.find('h1', class_='-fs20 -pts -pbxs').get_text(strip=True)
amount = soup.find('span', class_='-b -ltr -tal -fs24').get_text(strip=True)
review = soup.find('div', class_='stars _s _al').get_text(strip=True)
rating = soup.find('a', class_='-plxs _more').get_text(strip=True)
features = soup.find('article', class_='col8 -pvs').find_all('li')
print(f"Name: {name}")
print(f"Amount: {amount}")
print(f"Review: {review}")
print(f"Rating: {rating}")
print('Key Features')
for i, feature in enumerate(features):
if i > 25: # we ran out of letters
break
print(chr(ord("a")+i) + ":", feature.get_text(strip=True))
print('')
except AttributeError:
continue

Scraping a website keeps returning 'NoneType' object has no attribute 'find' error (BS4)

I'm new to BS4. So I'm having a hard time decoding why this error keeps coming. I want to find the book name, rank, author, rating and price of books, but every time I run the code, all of them keeps on returning that error.
Here's my code:
try:
url = requests.get("https://www.amazon.in/gp/bestsellers/books/1318158031/ref=zg_bs_nav_books_1")
url.raise_for_status()
soup = BeautifulSoup(url.text, "html.parser")
books = soup.find("div", class_="p13n-gridRow _cDEzb_grid-row_3Cywl").find_all("div", id="gridItemRoot")
for book in books:
rank = book.find("div", class_="aok-float-left").span.text.split("#")[1]
name = book.find("div", class_="zg-grid-general-faceout").span.div.text
author = book.find("div", class_="a-row a-size-small").div.text
rating = book.find("div", class_="a-icon-row").find("a", class_="a-link-normal")(["title"])
price = book.find("div", class_="a-row").next_sibling.next_sibling.next_sibling.span.text
print(name)
except Exception as e:
print(e)

You are getting NoneType error because some items are missing in the listing/in html tree. So to ged rid of such error, you can use if else None statement
import pandas as pd
import requests
from bs4 import BeautifulSoup
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
url='https://www.amazon.in/gp/bestsellers/books/1318158031/ref=zg_bs_nav_books_1'
req = requests.get(url,headers=headers)
print(req)
soup = BeautifulSoup(req.text, 'lxml')
books = soup.find("div", class_="p13n-gridRow _cDEzb_grid-row_3Cywl").find_all("div", id="gridItemRoot")
for book in books:
rank = book.find("div", class_="aok-float-left").span.text.split("#")[1]
name = book.find("div", class_="zg-grid-general-faceout").span.div
name=name.text if name else None
author = book.find("div", class_="a-row a-size-small").div.text
rating = book.find("span", class_="a-icon-alt")
rating=rating.text if rating else None
price = book.select_one(".a-size-base.a-color-price span").text
#price=price.text if price else None
print(price)
Output:
₹299.00
₹316.00
₹139.00
₹323.00
₹284.05
₹761.00
₹187.00
₹299.00
₹222.30
₹299.00
₹299.00
₹550.00
₹139.00
₹305.99
₹299.00
₹256.00
₹297.00
₹1,012.00
₹309.00
₹109.00
₹399.00
₹292.74
₹289.75
₹410.00
₹279.30
₹125.00
₹313.95
₹449.00
₹357.00
₹198.00

Pandas to_csv only write the data from certain page

I tried to scrape data from tripadvisor, but from several pages that I tried to scrape, when I try to export it to csv it only shows 1 line of data and gives an error message like this
AttributeError: 'NoneType' object has no attribute 'text'
this is my code
import requests
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
URL = 'https://www.tripadvisor.com/Attraction_Review-g469404-d3780963-Reviews-oa'
for offset in range(0, 30, 10):
url = URL + str(offset) + '-Double_Six_Beach-Seminyak_Kuta_District_Bali.html'
headers = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
container = soup.find_all('div', {'class':'_2rspOqPP'})
for r in container:
reviews = r.find_all('div', {'class': None})
#the container that contains the elements that I want to scrape has no attributes and use DOM element. So I tried to access div with _2rspOqPP class first then access the div with no attributes from there
records = []
for review in reviews:
user = review.find('a', {'class':'_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
country = review.find('div', {'class' : 'DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy'}).span.text
date = review.find('div', {'class' : '_3JxPDYSx'}).text
content = review.find('div', {'class' : 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
records.append((user, country, date, content))
df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
Code updated
for r in container:
reviews = r.find_all('div', {'class': None})
records = []
for review in reviews:
try:
user = review.find('a', {'class':'_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
country = review.find('div', {'class' : 'DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy'}).span.text
date = review.find('div', {'class' : '_3JxPDYSx'}).text
content = review.find('div', {'class' : 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
records.append((user, country, date, content))
except:
pass
print(records)
df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')

You should move the records out of the for loops and unindent the last few lines.
See this:
import pandas as pd
import requests
from bs4 import BeautifulSoup
main_url = 'https://www.tripadvisor.com/Attraction_Review-g469404-d3780963-Reviews-oa'
country_class = "DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy"
records = []
for offset in range(0, 30, 10):
url = main_url + str(offset) + '-Double_Six_Beach-Seminyak_Kuta_District_Bali.html'
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}
soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
container = soup.find_all('div', {'class': '_2rspOqPP'})
for r in container:
reviews = r.find_all('div', {'class': None})
for review in reviews:
try:
user = review.find('a', {'class': '_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
country = review.find('div', {'class': country_class}).span.text
date = review.find('div', {'class': '_3JxPDYSx'}).text
content = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
records.append((user, country, date, content))
except AttributeError:
pass
df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
Output from the .csv file:

How do I get Market Cap data for a company using urllib and regex?

When I run the following:
try:
url = 'http://www.zacks.com/stock/quote/AAPL'
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17'
values = {'quote': 'aapl'}
data = urllib.parse.urlencode(values)
data = data.encode('utf-8')
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
respData = resp.read()
marketcap = re.findall(r'<span>(.*?)</span>',str(respData))
for eachP in marketcap:
print(eachP)
except Exception as e:
print(str(e))
Python returns:
Menu
Back to top
&
USD
48,485,528
884.23 B
2.52 ( 1.51%)
1.31
I am looking for the 884.23 B value and I tried to get that through adding an index number in the following line.
marketcap = re.findall(r'<span>(.*?)</span>',str(respData))[15]
Once I do that, python returns this:
8
8
4
.
2
3
B
But instead of that, I would like the program to return the following:
884.23 B
Help would be much appreciated. Thank you in advance!

You can use Beautifulsoup to scrap a website.
EX:
import requests
from bs4 import BeautifulSoup
r = requests.get("http://www.zacks.com/stock/quote/AAPL")
soup = BeautifulSoup(r.content, "html.parser")
for tr in soup.findAll("table", class_="abut_bottom"):
for td in tr.find_all("td"):
if td.text == "Market Cap":
print td.text, td.find_next_sibling("td").text
Output:
Market Cap 884.23 B

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

python stripping text and definition of names - python

Related

I want to scrape the longitude and the latitude of this restaurant

Why am I getting Attribute error: nonetype object has no attribute get_text whenever I try to scrape this ecommerce store

Scraping a website keeps returning 'NoneType' object has no attribute 'find' error (BS4)

Pandas to_csv only write the data from certain page

How do I get Market Cap data for a company using urllib and regex?

Categories

Resources