while True:
    for rate in soup.find_all('div',{"class":"rating"}):
        if rate.img is not None:
            print (rate.img['alt'])
    try:
        driver.find_element_by_link_text('Next').click()
    except:
        break
driver.quit()

while True:
    for rate in soup.findAll('div',{"class":"listing_title"}):
        print (rate.a.text)
    try:
        driver.find_element_by_link_text('Next').click()
    except:
        break
driver.quit()
This should do what you're looking for. Grab the parent element of both (I chose .listing), get each attribute from there, insert them into a dict, and then write the dicts to CSV with Python's csv library. Fair warning: I didn't run it until it broke on its own; I just broke out after the second loop to save some computing.
WARNING HAVE NOT TESTED ON FULL SITE
import csv
import time
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

url = 'http://www.tripadvisor.in/Hotels-g186338-London_England-Hotels.html'
driver = webdriver.Firefox()
driver.get(url)

hotels = []
while True:
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    listings = soup.select('div.listing')
    for l in listings:
        hotel = {}
        hotel['name'] = l.select('a.property_title')[0].text
        hotel['rating'] = float(l.select('img.sprite-ratings')[0]['alt'].split('of')[0])
        hotels.append(hotel)
    # find_element_by_link_text raises NoSuchElementException when there is no
    # Next link, so catch that instead of testing the return value
    try:
        next = driver.find_element_by_link_text('Next')
    except NoSuchElementException:
        break
    next.click()
    time.sleep(0.5)

if len(hotels) > 0:
    with open('ratings.csv', 'w', newline='') as f:
        fieldnames = [k for k in hotels[0].keys()]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for h in hotels:
            writer.writerow(h)

driver.quit()
You should look at using a list of class names.
I would try something like this:
for rate in soup.findAll('div',{"class":["rating","listing_title"]}):
(could be wrong, this machine doesn't have bs4 for me to check, sorry)
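If the combined selector does work, one rough way to tell the two kinds of tags apart afterwards could be the following (a sketch only, assuming soup is the parsed TripAdvisor page from the question):
matches = []
for tag in soup.find_all('div', {"class": ["rating", "listing_title"]}):
    # a tag's class attribute comes back as a list of class names
    if 'listing_title' in (tag.get('class') or []):
        matches.append(('name', tag.a.text))
    elif tag.img is not None:
        matches.append(('rating', tag.img['alt']))
print(matches)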
I have several URLs which link to hotel pages, and I would like to scrape some data from them.
I'm using the following script, but I would like to update it:
data=[]

for i in range(0,10):
    url = final_list[i]
    driver2 = webdriver.Chrome()
    driver2.get(url)
    sleep(randint(10,20))
    soup = BeautifulSoup(driver2.page_source, 'html.parser')
    my_table2 = soup.find_all(class_=['title-2', 'rating-score body-3'])
    review=soup.find_all(class_='reviews')[-1]
    try:
        price=soup.find_all('span', attrs={'class':'price'})[-1]
    except:
        price=soup.find_all('span', attrs={'class':'price'})
    for tag in my_table2:
        data.append(tag.text.strip())
    for p in price:
        data.append(p)
    for r in review:
        data.append(r)
But here's the problem: tag.text.strip() scrapes each rating number as a separate value.
Each number becomes its own entry, but hotels don't all have the same amount of ratings. For example, one hotel has only 7 ratings while the default number is 8; some have seven ratings, others six, and so on. So in the end my dataframe is misaligned: if a hotel doesn't have 8 ratings, the values get shifted.
My question is: how can I tell the script "if there is a value for this rating, append it, but if there isn't, append None instead", and of course do that for all eight values?
I tried several things like :
for tag in my_table2:
    for i in tag.text.strip()[i]:
        if i:
            data.append(i)
        else:
            data.append(None)
But unfortunately that goes nowhere, so if you could help me figure out the answer, it would be awesome :)
In case it helps, here is a link to a hotel that I'm scraping:
https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1
The rating numbers are at the end of the page.
Thank you.
A few suggestions:
Put your data in a dictionary. That way you don't have to assume that all tags are present, and the order of the tags doesn't matter. You can get the labels and the corresponding ratings with
rating_labels = soup.find_all(class_=['rating-label body-3'])
rating_scores = soup.find_all(class_=['rating-score body-3'])
and then iterate over both lists with zip
Move your driver outside of the loop; opening it once is enough.
Don't use a fixed sleep; use Selenium's wait functions instead. You can wait for a particular element to be present or populated with WebDriverWait(driver, 10).until(EC.presence_of_element_located(your_element)) (see the short sketch after this list).
https://selenium-python.readthedocs.io/waits.html
Cache your scraped HTML code to a file. It's faster for you and more polite to the website you are scraping.
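For reference, a minimal sketch of such an explicit wait (the '.rating-label' CSS selector is an assumption used purely for illustration, not taken from the actual page):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1')
# wait up to 10 seconds for the rating section to appear instead of sleeping a fixed time
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.rating-label'))
)
source = driver.page_source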
import selenium
import selenium.webdriver
import time
import random
import os
from bs4 import BeautifulSoup

data = []

final_list = [
    'https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1',
    'https://www.hostelworld.com/pwa/hosteldetails.php/Be-Ramblas-Hostel/Barcelona/435?from=2020-11-27&to=2020-11-28&guests=1'
]

# load your driver only once to save time
driver = selenium.webdriver.Chrome()

for url in final_list:
    data.append({})

    # cache the HTML code to the filesystem
    # generate a filename from the URL where all non-alphanumeric characters (e.g. :/) are replaced with underscores _
    filename = ''.join([s if s.isalnum() else '_' for s in url])
    if not os.path.isfile(filename):
        driver.get(url)
        # better use selenium's wait functions here
        time.sleep(random.randint(10, 20))
        source = driver.page_source
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(source)
    else:
        with open(filename, 'r', encoding='utf-8') as f:
            source = f.read()

    soup = BeautifulSoup(source, 'html.parser')

    review = soup.find_all(class_='reviews')[-1]
    try:
        price = soup.find_all('span', attrs={'class':'price'})[-1]
    except:
        price = soup.find_all('span', attrs={'class':'price'})

    data[-1]['name'] = soup.find_all(class_=['title-2'])[0].text.strip()

    rating_labels = soup.find_all(class_=['rating-label body-3'])
    rating_scores = soup.find_all(class_=['rating-score body-3'])
    assert len(rating_labels) == len(rating_scores)
    for label, score in zip(rating_labels, rating_scores):
        data[-1][label.text.strip()] = score.text.strip()

    data[-1]['price'] = price.text.strip()
    data[-1]['review'] = review.text.strip()
The data can then be easily put in a nicely formatted table using Pandas
import pandas as pd
df = pd.DataFrame(data)
df
If some data is missing or incomplete, Pandas will fill it in with NaN:
data.append(data[0].copy())
del(data[-1]['Staff'])
data[-1]['name'] = 'Incomplete Hostel'
pd.DataFrame(data)
I want to scrape the company info of all the companies from the URL given below and view their job details.
URL : http://desiopt.com/search-results-jobs/
from selenium import webdriver
import bs4
import pandas as pd
from bs4 import BeautifulSoup
import re
driver = webdriver.Chrome(executable_path=r"C:/Users/Chandra Sekhar/Desktop/chrome-driver/chromedriver.exe")
titles=[]
driver.get("http://desiopt.com/search-results-jobs/")
content = driver.page_source
soup = BeautifulSoup(content)
for a in soup.findAll('div',attrs={'class':'listing-links'}):
    info=a.find('div', attrs={'class':'userInfo'})
    print(info.text)
    titles.append(info.text)
df = pd.DataFrame({'Company info':titles})
df['Price'] = df['Price'].map(lambda x: re.sub(r'\W+', '', x))
df.to_csv('products1.csv', index=False)
Use the following URL:
https://desiopt.com/search-results-jobs/?action=search&page=&listings_per_page=&view=list
There are two parameters here that you will edit: page= and listings_per_page=.
Currently the website has 37091 jobs.
From my testing, listings_per_page is limited to 1000 per page.
Example: https://desiopt.com/search-results-jobs/?action=search&page=1&listings_per_page=1000&view=list
So you will need to loop from page=1 to page=38 and set listings_per_page=1000 (a rough sketch follows below).
That means 1000 results per page * 38 pages = 38000 slots, which covers all 37091 jobs.
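For illustration, the page URLs for that loop could be built like this (a sketch only, using the parameters described above):
base = "https://desiopt.com/search-results-jobs/?action=search&page={}&listings_per_page=1000&view=list"
# page=1 .. page=38, 1000 listings per page, as described above
page_urls = [base.format(page) for page in range(1, 39)]
print(len(page_urls))  # 38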
After that:
Collect all the links into a list, checking for duplicates before appending if you care about preserving order. Otherwise just put them in a set, which doesn't accept duplicates but doesn't preserve order either. Then you can parse each URL in the list or set to collect the information.
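For illustration, the two de-duplication options mentioned above might look like this (the links here are placeholder data, not scraped output):
links = ['https://example.com/a', 'https://example.com/b', 'https://example.com/a']  # placeholder data
# option 1: a list built with dict.fromkeys drops duplicates and keeps the original order
ordered_unique = list(dict.fromkeys(links))
# option 2: a set drops duplicates too, but does not care about order
unordered_unique = set(links)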
By the way, I'll loop over 371 pages with 100 items each, so I will get 37100 URLs (or fewer if the last page has less than 100), remove the duplicates, and then parse:
import requests
from bs4 import BeautifulSoup
import csv

links = []

try:
    for item in range(1, 372):
        print(f"Extraction Page# {item}")
        r = requests.get(
            f"https://desiopt.com/search-results-jobs/?action=search&page={item}&listings_per_page=100&view=list")
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            for item in soup.findAll('span', attrs={'class': 'captions-field'}):
                for a in item.findAll('a'):
                    a = a.get('href')
                    if a not in links:
                        links.append(a)
except KeyboardInterrupt:
    print("Good Bye!")
    exit()

data = []

try:
    for link in links:
        r = requests.get(link)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            for item in soup.findAll('div', attrs={'class': 'compProfileInfo'}):
                a = [a.text.strip() for a in item.findAll('span')]
                if a[6] == '':
                    a[6] = 'N/A'
                data.append(a[0:7:2])
except KeyboardInterrupt:
    print("Good Bye!")
    exit()

while True:
    try:
        with open('output.csv', 'w+', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Name', 'Phone', 'Email', 'Website'])
            writer.writerows(data)
            print("Operation Completed")
    except PermissionError:
        print("Please Close The File")
        continue
    except KeyboardInterrupt:
        print("Good Bye")
        exit()
    break
The output is 1885 rows, because I let the script remove duplicated company links before parsing.
I have a problem with writing the scraped data to a CSV file. The pages load and the first part of the script works, but writing to the CSV causes a problem.
My question is: how can I write the data (Name, Home State and Backer State) to a CSV file? The following code only writes Category to the CSV file.
Code:
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime
from collections import OrderedDict
import re

browser = webdriver.Firefox()
browser.get('https://www.kickstarter.com/discover?ref=nav')
categories = browser.find_elements_by_class_name('category-container')

category_links = []
for category_link in categories:
    #Each item in the list is a tuple of the category's name and its link.
    category_links.append((str(category_link.find_element_by_class_name('f3').text),
                           category_link.find_element_by_class_name('bg-white').get_attribute('href')))

scraped_data = []
now = datetime.now()
counter = 1

for category in category_links:
    browser.get(category[1])
    browser.find_element_by_class_name('sentence-open').click()
    time.sleep(2)
    browser.find_element_by_id('category_filter').click()
    time.sleep(2)

    for i in range(27):
        try:
            time.sleep(2)
            browser.find_element_by_id('category_'+str(i)).click()
            time.sleep(2)
        except:
            pass

    projects = []
    for project_link in browser.find_elements_by_class_name('clamp-3'):
        projects.append(project_link.find_element_by_tag_name('a').get_attribute('href'))

    for counter, project in enumerate(projects):
        page1 = urllib.request.urlopen(projects[counter])
        soup1 = BeautifulSoup(page1, "lxml")
        page2 = urllib.request.urlopen(projects[counter].split('?')[0]+'/community')
        soup2 = BeautifulSoup(page2, "lxml")
        time.sleep(2)
        print(str(counter)+': '+project+'\nStatus: Started.')
        project_dict = OrderedDict()
        project_dict['Category'] = category[0]
        browser.get(project)
        project_dict['Name'] = soup1.find(class_='type-24 type-28-sm type-38-md navy-700 medium mb3').text
        project_dict['Home State'] = soup1.find(class_='nowrap navy-700 flex items-center medium type-12').text
        try:
            project_dict['Backer State'] = soup2.find(class_='location-list-wrapper js-location-list-wrapper').text
        except:
            pass
        print('Status: Done.')
        counter+=1
        scraped_data.append(project_dict)

later = datetime.now()
diff = later - now
print('The scraping took '+str(round(diff.seconds/60.0,2))+' minutes, and scraped '+str(len(scraped_data))+' projects.')

df = pd.DataFrame(scraped_data)
df.to_csv('kickstarter-data1.csv')
With the following code I try to get, from Kickstarter, the home city and the places where the backers are located. However, I keep running into the following error:
  File "D:/location", line 60, in <module>
    page1 = urllib.request.urlopen(projects[counter])
IndexError: list index out of range
Does someone have a more elegant way to feed the page to urllib.request.urlopen? (See the lines marked with ** ** in the code below.)
code:
# coding: utf-8
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime
from collections import OrderedDict
import re

browser = webdriver.Firefox()
browser.get('https://www.kickstarter.com/discover?ref=nav')
categories = browser.find_elements_by_class_name('category-container')

category_links = []
for category_link in categories:
    #Each item in the list is a tuple of the category's name and its link.
    category_links.append((str(category_link.find_element_by_class_name('f3').text),
                           category_link.find_element_by_class_name('bg-white').get_attribute('href')))

scraped_data = []
now = datetime.now()
counter = 1

for category in category_links:
    browser.get(category[1])
    browser.find_element_by_class_name('sentence-open').click()
    time.sleep(2)
    browser.find_element_by_id('category_filter').click()
    time.sleep(2)

    for i in range(27):
        try:
            time.sleep(2)
            browser.find_element_by_id('category_'+str(i)).click()
            time.sleep(2)
        except:
            pass

    #while True:
    #    try:
    #        browser.find_element_by_class_name('load_more').click()
    #    except:
    #        break

    projects = []
    for project_link in browser.find_elements_by_class_name('clamp-3'):
        projects.append(project_link.find_element_by_tag_name('a').get_attribute('href'))

    for project in projects:
        **page1 = urllib.request.urlopen(projects[counter])**
        soup1 = BeautifulSoup(page1, "lxml")
        **page2 = urllib.request.urlopen(projects[counter].split('?')**[0]+'/community')
        soup2 = BeautifulSoup(page2, "lxml")
        time.sleep(2)
        print(str(counter)+': '+project+'\nStatus: Started.')
        project_dict = OrderedDict()
        project_dict['Category'] = category[0]
        browser.get(project)
        project_dict['Name'] = soup1.find(class_='type-24 type-28-sm type-38-md navy-700 medium mb3').text
        project_dict['Home State'] = str(soup1.find(class_='nowrap navy-700 flex items-center medium type-12').text)
        try:
            project_dict['Backer State'] = str(soup2.find(class_='location-list-wrapper js-location-list-wrapper').text)
        except:
            pass
        print('Status: Done.')
        counter+=1
        scraped_data.append(project_dict)

later = datetime.now()
diff = later - now
print('The scraping took '+str(round(diff.seconds/60.0,2))+' minutes, and scraped '+str(len(scraped_data))+' projects.')

df = pd.DataFrame(scraped_data)
df.to_csv('kickstarter-data.csv')
If you only use counter to print the project status message, you can use range or enumerate instead. Here is an example with enumerate:
for counter, project in enumerate(projects):
    ... code ...
enumerate produces a tuple (index, item), so the rest of your code should work fine as it is.
A few more things:
List indices start at 0, so when you use counter to access items you get an IndexError because you initialize counter with 1.
In the for loop you don't need projects[counter]; just use project (see the sketch below).
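For what it's worth, a sketch of how the inner loop could look with enumerate, reusing the question's own calls (untested against the live site):
for counter, project in enumerate(projects):
    # `project` is already the URL, so there is no need to index into `projects`
    page1 = urllib.request.urlopen(project)
    soup1 = BeautifulSoup(page1, "lxml")
    page2 = urllib.request.urlopen(project.split('?')[0] + '/community')
    soup2 = BeautifulSoup(page2, "lxml")
    print(str(counter) + ': ' + project + '\nStatus: Started.')
    # ... build project_dict and append it to scraped_data exactly as in the question ...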
My goal is to scrape data from the PGA website to extract all the golf course locations in the USA. I aim to scrape from the 907 pages the name, address, ownership, phone number, and website.
I have created the script below, but the CSV it produces has problems: it repeats data from the first few pages of the website and does not give the whole data of the 907 pages.
How can I fix my script so that it will scrape all 907 pages and produce a CSV with all the golf courses listed on the PGA website?
Below is my script:
import csv
import requests
from bs4 import BeautifulSoup

for i in range(907):      # Number of pages plus one
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)

g_data2=soup.find_all("div",{"class":"views-field-nothing"})

courses_list=[]

for item in g_data2:
    try:
        name=item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
    except:
        name=''
    try:
        address1=item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
    except:
        address1=''
    try:
        address2=item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
    except:
        address2=''
    try:
        website=item.contents[1].find_all("div",{"class":"views-field-website"})[0].text
    except:
        website=''
    try:
        Phonenumber=item.contents[1].find_all("div",{"class":"views-field-work-phone"})[0].text
    except:
        Phonenumber=''

    course=[name,address1,address2,website,Phonenumber]
    courses_list.append(course)

with open ('PGA_Data.csv','a') as file:
    writer=csv.writer(file)
    for row in courses_list:
        writer.writerow(row)
Here is the code that you want. It will first parse the current page before going on to the next one. (There are some blank rows; I hope you can fix that yourself. A possible fix is sketched after the code.)
import csv
import requests
from bs4 import BeautifulSoup

def encode(l):
    out = []
    for i in l:
        text = str(i).encode('utf-8')
        out.append(''.join([i if ord(i) < 128 else ' ' for i in text]))  # taken from Martijn Pieters' answer
        # http://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space/20078869#20078869
    return out

courses_list = []

for i in range(5):  # Number of pages plus one
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)

    g_data2 = soup.find_all("div", {"class": "views-field-nothing"})

    for item in g_data2:
        try:
            name = item.contents[1].find_all("div", {"class": "views-field-title"})[0].text
        except:
            name = ''
        try:
            address1 = item.contents[1].find_all("div", {"class": "views-field-address"})[0].text
        except:
            address1 = ''
        try:
            address2 = item.contents[1].find_all("div", {"class": "views-field-city-state-zip"})[0].text
        except:
            address2 = ''
        try:
            website = item.contents[1].find_all("div", {"class": "views-field-website"})[0].text
        except:
            website = ''
        try:
            Phonenumber = item.contents[1].find_all("div", {"class": "views-field-work-phone"})[0].text
        except:
            Phonenumber = ''

        course = [name, address1, address2, website, Phonenumber]
        courses_list.append(encode(course))

with open('PGA_Data.csv', 'a') as file:
    writer = csv.writer(file)
    for row in courses_list:
        writer.writerow(row)
EDIT: After the inevitable problems of unicode encoding and decoding, I have modified the answer and it will (hopefully) work now. But see this: http://nedbatchelder.com/text/unipain.html
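For the blank rows mentioned at the top of this answer, one possible (untested) tweak is to skip a course when every field came back empty:
course = [name, address1, address2, website, Phonenumber]
# only keep the row if at least one field is non-empty after stripping whitespace
if any(field.strip() for field in course):
    courses_list.append(encode(course))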