I tried to scrape data from TripAdvisor, but for several of the pages I tried, exporting to CSV produces only one row of data and raises an error like this:
AttributeError: 'NoneType' object has no attribute 'text'
This is my code:
import requests
import pandas as pd
from requests import get
from bs4 import BeautifulSoup

URL = 'https://www.tripadvisor.com/Attraction_Review-g469404-d3780963-Reviews-oa'

for offset in range(0, 30, 10):
    url = URL + str(offset) + '-Double_Six_Beach-Seminyak_Kuta_District_Bali.html'
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    container = soup.find_all('div', {'class': '_2rspOqPP'})
    for r in container:
        reviews = r.find_all('div', {'class': None})
        # the container that holds the elements I want to scrape has no attributes,
        # so I access the div with the _2rspOqPP class first, then the
        # attribute-less divs inside it
        records = []
        for review in reviews:
            user = review.find('a', {'class': '_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
            country = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy'}).span.text
            date = review.find('div', {'class': '_3JxPDYSx'}).text
            content = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
            records.append((user, country, date, content))

        df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
        df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
Code updated
for r in container:
    reviews = r.find_all('div', {'class': None})
    records = []
    for review in reviews:
        try:
            user = review.find('a', {'class': '_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
            country = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy'}).span.text
            date = review.find('div', {'class': '_3JxPDYSx'}).text
            content = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
            records.append((user, country, date, content))
        except:
            pass
    print(records)

df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
You should move records = [] out of the for loops and unindent the last few lines (the DataFrame creation and the CSV export), so that rows from every page accumulate in a single list instead of being reset and overwritten on each iteration.
See this:
import pandas as pd
import requests
from bs4 import BeautifulSoup

main_url = 'https://www.tripadvisor.com/Attraction_Review-g469404-d3780963-Reviews-oa'
country_class = "DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy"

records = []
for offset in range(0, 30, 10):
    url = main_url + str(offset) + '-Double_Six_Beach-Seminyak_Kuta_District_Bali.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    }
    soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
    container = soup.find_all('div', {'class': '_2rspOqPP'})
    for r in container:
        reviews = r.find_all('div', {'class': None})
        for review in reviews:
            try:
                user = review.find('a', {'class': '_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
                country = review.find('div', {'class': country_class}).span.text
                date = review.find('div', {'class': '_3JxPDYSx'}).text
                content = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
                records.append((user, country, date, content))
            except AttributeError:
                pass

df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
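As an aside, the except AttributeError works because find() returns None when nothing matches, and None.text is exactly what raises. If you would rather keep partial rows than skip them entirely, here is a minimal sketch of a guard helper (safe_text is an illustrative name, not part of the original code):

import requests
from bs4 import BeautifulSoup

def safe_text(parent, tag, cls):
    # return the element's text, or None when the element is missing
    el = parent.find(tag, {'class': cls})
    return el.text if el else None

# usage inside the review loop, e.g.:
# user = safe_text(review, 'a', '_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL')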
Output from the .csv file:
<img class="no-img" data-src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium" alt="Biryani By Kilo" data-gatype="RestaurantImageClick" data-url="/delhi/biryani-by-kilo-connaught-place-central-delhi-40178" data-w-onclick="cardClickHandler" src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium">
page url - https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p=1
This page contains some restaurant cards. While scraping the page in a loop, I want to follow each restaurant card's URL (the data-url attribute in the HTML above) and scrape the number of reviews from inside it. I don't know how to do that; my current code for normal front-page scraping is:
import re
import requests
from bs4 import BeautifulSoup

def extract(page):
    url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}"  # URL of the website
    header = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}  # temporary user agent
    r = requests.get(url, headers=header)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup):  # function to scrape the page
    divs = soup.find_all('div', class_='restnt-card restaurant')
    for item in divs:
        title = item.find('a').text.strip()  # restaurant name
        loc = item.find('div', class_='restnt-loc ellipsis').text.strip()  # restaurant location
        try:  # some restaurants are unrated; scraping those would otherwise raise an error
            rating = item.find('div', class_="img-wrap").text
            rating = re.sub("[^0-9,.]", "", rating)
        except:
            rating = None
        price_text = item.find('span', class_="double-line-ellipsis").text.strip()  # price for biryani
        price = re.sub("[^0-9]", "", price_text)[:-1]
        biry_del = {
            'name': title,
            'location': loc,
            'rating': rating,
            'price': price
        }
        rest_list.append(biry_del)

rest_list = []
for i in range(1, 18):
    print(f'getting page, {i}')
    c = extract(i)
    transform(c)
I hope you guys understood; please ask in the comments if anything is unclear.
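For the "go inside the card URL" part: the link lives in the data-url attribute of the <img> shown above, so the direct approach is to read that attribute and request the page it points to. A minimal sketch under that assumption (the review markup on the detail page isn't shown in the question, so locating the review count there is left open):

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def get_detail_soup(item, header):
    # data-url holds a relative path like '/delhi/biryani-by-kilo-...' (see the HTML above);
    # join it onto the site root, fetch it, and parse -- the review count can then be
    # looked up in the returned soup
    img = item.find('img', class_='no-img')
    if img is None or not img.get('data-url'):
        return None
    detail_url = urljoin('https://www.dineout.co.in', img['data-url'])
    return BeautifulSoup(requests.get(detail_url, headers=header).content, 'html.parser')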
It's not very fast, but it looks like you can get all the details you want, including the review count (not 232!), if you hit this backend API endpoint:
https://www.dineout.co.in/get_rdp_data_main/delhi/69676/restaurant_detail_main
import requests
from bs4 import BeautifulSoup
import pandas as pd

rest_list = []
for page in range(1, 3):
    print(f'getting page, {page}')
    s = requests.Session()
    url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}"  # URL of the website
    header = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}  # temporary user agent
    r = s.get(url, headers=header)
    soup = BeautifulSoup(r.content, 'html.parser')
    divs = soup.find_all('div', class_='restnt-card restaurant')
    for item in divs:
        code = item.find('a')['href'].split('-')[-1]  # restaurant code
        print(f'Getting details for {code}')
        data = s.get(f'https://www.dineout.co.in/get_rdp_data_main/delhi/{code}/restaurant_detail_main').json()
        info = data['header']
        info.pop('share')  # clean up csv
        info.pop('options')
        rest_list.append(info)

df = pd.DataFrame(rest_list)
df.to_csv('dehli_rest.csv', index=False)
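If you need fields beyond the header block, a quick way to see what else the endpoint returns (the answer only shows the 'header' section, so inspect the keys before relying on anything else):

import requests

# same endpoint as above, using the restaurant code from the answer
data = requests.get('https://www.dineout.co.in/get_rdp_data_main/delhi/69676/restaurant_detail_main').json()
print(data.keys())            # top-level sections of the payload
print(data['header'].keys())  # the fields that end up in the CSV above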
I am trying to scrape multiple pages using grequests and BeautifulSoup. I can scrape a single page, but when I change the code to iterate over multiple pages I get the error listed in the title.
CODE:
import grequests  # was missing; needed for grequests.get / grequests.map below
from bs4 import BeautifulSoup
import pandas as pd

_city = input('Enter the name of the City and State, example format(miami-fl): ')
headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'}

def get_urls():
    urls = []
    for x in range(1, 2):
        urls.append(f'https://www.apartments.com/miami-fl/{x}/')
    return urls

def get_data(urls):
    reqs = [grequests.get(link) for link in urls]
    resp = grequests.map(reqs)
    return resp

def parse(resp):
    apartments = []
    for r in resp:
        soup = BeautifulSoup(r.text, 'lxml')
        results = soup.find_all('li', {'class': 'mortar-wrapper'})
        for item in results:
            apartment = {
                'Property_name': item.find('span', {'class': 'js-placardTitle title'}).text,
                'Unit_name': item.find(''),
                'Formatted_address': item.find('div', {'class': 'property-address js-url'}).text,
                'City&State': _city,
                'Bedrooms': item.find('div', {'class': 'bed-range'}).text,
                'Price_Range': item.find('div', {'class': 'price-range'}).text,
                'Availability': item.find('div', {'class': 'availability'}).text,
                'Property_Amenities': item.find('div', {'class': 'property-amenities'}).text.strip(),
                'Phone_Number': item.find('a', {'class': 'phone-link js-phone'}).attrs['href'],
            }
            apartments.append(apartment)
    print(apartments)
    return apartments

#def output(apartments):
#    aptdf = pd.DataFrame(apartments)
#    aptdf.to_csv('apts.csv', index=False)
#    print('Saved to CSV')
#    return

if __name__ == '__main__':
    urls = get_urls()
    resp = get_data(urls)
    df = pd.DataFrame(parse(resp))
    df.to_csv('apts.csv', index=False)
    #output(apartments)
I edited the code to correct the format, but it still won't run or debug.
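Two things stand out in the code above, assuming the error in the title is the usual AttributeError on a None response: headers is defined but never passed to grequests.get, so the site may reject some requests, and grequests.map() puts None in the result list for every request that failed, after which BeautifulSoup(r.text, 'lxml') raises. A hedged sketch of the guard:

def get_data(urls):
    reqs = [grequests.get(link, headers=headers) for link in urls]
    resp = grequests.map(reqs)
    # grequests.map() leaves None where a request failed; drop those before parsing
    return [r for r in resp if r is not None]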
Can someone help me modify this script so that it also scrapes the URL associated with each job? The purpose is that, when browsing the .csv file in a spreadsheet, I can click on the link if I want more information about the job. Thank you in advance.
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
    url = f'https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius={page}'
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup):
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        company = item.find('span', class_='company').text.strip()
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except:
            salary = ''
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')
        job = {
            'title': title,
            'company': company,
            'salary': salary,
            'summary': summary
        }
        joblist.append(job)
    return

joblist = []
for i in range(0, 90, 10):
    print(f'Getting page, {i}')
    c = extract(0)
    transform(c)

df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')
You can use any one of these:
url = 'https://www.indeed.com' + item.find('a')['href']
url = 'https://www.indeed.com' + item.find('a').get('href')
url = 'https://www.indeed.com' + item.find('a').attrs['href']
url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
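All four are equivalent when the attribute is present; they only differ when href is missing:

a = item.find('a')
a['href']        # raises KeyError when the attribute is absent
a.get('href')    # returns None instead (same for .attrs.get('href'))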
BTW:
You always load the same page. To get the next page you have to use start=... in the URL.
You can also make this more readable by using a dictionary and params= in requests:
payload = {
    'q': 'Dispensary',
    'l': 'Denver,+CO',
    'radius': 0,
    'start': page,
}

url = 'https://www.indeed.com/jobs'
r = requests.get(url, params=payload, headers=headers)
Working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(start):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    }
    payload = {
        'q': 'Dispensary',
        'l': 'Denver,+CO',
        'radius': 0,
        'start': start,
    }
    url = 'https://www.indeed.com/jobs'
    r = requests.get(url, params=payload, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup, joblist):
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        url = 'https://www.indeed.com' + item.find('a')['href']
        #url = 'https://www.indeed.com' + item.find('a').get('href')
        #url = 'https://www.indeed.com' + item.find('a').attrs['href']
        #url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
        company = item.find('span', class_='company').text.strip()
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except:
            salary = ''
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')
        joblist.append({
            'title': title,
            'url': url,
            'company': company,
            'salary': salary,
            'summary': summary
        })

# --- main ---

joblist = []
for start in range(0, 90, 10):
    print('Getting page', start)
    c = extract(start)
    transform(c, joblist)

df = pd.DataFrame(joblist)
df.to_csv('jobs.csv')
print(df.head())
The code below gives me the required product data as a table. It works fine for most links, but for some it stops midway with the error NoneType object has no attribute 'find_all' at table.find_all('tr').
I believe this is because the table does not exist for some products, so I tried adding an if condition on the existence of the table, but that doesn't seem to help either. What changes should I make to the code below?
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

data = []
for i in range(1, 3):
    print(i)
    res = requests.get(url + str(i), headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    p_links = [i["data-link"] for i in soup.find("ul", {"id": "products-list"}).find_all("li", class_=["openlink", "item"])]
    for prod_url in p_links:
        print(prod_url)
        temp = {"Product URL": prod_url}
        prod_res = requests.get(prod_url, headers=headers)
        prod_soup = BeautifulSoup(prod_res.text, "html.parser")
        for p in prod_soup.find("div", class_="basic-information").find_all("p"):
            if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
            elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
            elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
        table = prod_soup.find("table", {"class": "specifications"})
        for tr in table.find_all("tr"):
            temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
        data.append(temp)

pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
You can use try and except (documentation):

try:
    for tr in table.find_all("tr"):
        temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
except:
    pass
You can use this:

table = soup.find('table', attrs={"class": "specifications"})
rows = table.findChildren(['tr']) if table else []
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

data = []
for i in range(1, 3):
    print(i)
    res = requests.get(url + str(i), headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    p_links = [i["data-link"] for i in soup.find("ul", {"id": "products-list"}).find_all("li", class_=["openlink", "item"])]
    for prod_url in p_links:
        print(prod_url)
        temp = {"Product URL": prod_url}
        prod_res = requests.get(prod_url, headers=headers)
        prod_soup = BeautifulSoup(prod_res.text, "html.parser")
        try:
            for p in prod_soup.find("div", class_="basic-information").find_all("p"):
                if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
                elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
                elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
            table = prod_soup.find("table", {"class": "specifications"})
            for tr in table.find_all("tr"):
                temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
        except:
            print("Failed for URL {}".format(prod_url))
        data.append(temp)
        time.sleep(2)

pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
Put a try/except not only around the product-specification extraction but also around the item/brand/sku extraction, and in the except print which URL failed so that you can try it again.
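A minimal sketch of that idea, collecting failures for one retry pass (scrape_product is an illustrative stand-in for the extraction code in the loop above, not part of the original answer):

failed = []

for prod_url in p_links:
    temp = {"Product URL": prod_url}
    try:
        scrape_product(prod_url, temp)  # stand-in: the try-block body from the code above
        data.append(temp)
    except Exception:
        print("Failed for URL {}".format(prod_url))
        failed.append(prod_url)

# second chance for whatever failed the first time
for prod_url in failed:
    temp = {"Product URL": prod_url}
    try:
        scrape_product(prod_url, temp)
        data.append(temp)
    except Exception:
        print("Still failing: {}".format(prod_url))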
I am scraping car names and car prices from a car site to append into a table that can be saved to an Excel file. I need help stripping everything except the car-name details, and with overcoming the 'names is not defined' problem. Here is the code:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

base_url = "https://www.carlist.my/used-cars-for-sale/malaysia"
response = get(base_url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
print(html_soup)

content_list = html_soup.find_all('div', attrs={'class': 'grid__item'})
print(content_list)

basic_info = []
for item in content_list:
    basic_info.append(item.find_all('a', attrs={'class': 'ellipsize js-ellipsize-text'}))
print(basic_info)

def get_names(basic_info):
    names = []
    for item in basic_info:
        for i in item:
            names.append(i.find_all('a', attrs={'class': 'ellipsize js-ellipsize-text'})[0].text.strip())
    return names

data = pd.DataFrame({'Name': names})[['Name']]
data.head()
data.drop_duplicates().to_excel('Car_list.xls')
NameError                                 Traceback (most recent call last)
<ipython-input-15-e2eba5476dff> in <module>
      6     return names
      7
----> 8 data = pd.DataFrame({'Name' : names})[['Name']]
      9 data.head()
     10 data.drop_duplicates().to_excel('Car_list.xls')

NameError: name 'names' is not defined
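The traceback shows the real problem: names only exists inside get_names(), and the function is never called, so the module-level lookup fails. A minimal fix, keeping the rest of the code as-is:

names = get_names(basic_info)  # actually call the function and bind its result
data = pd.DataFrame({'Name': names})
data.drop_duplicates().to_excel('Car_list.xls')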