Get the lists of things to do from tripadvisor - python

How can I get the 'things to do' list? I am new to web scraping and I don't know how to loop through each page to get the href of every 'thing to do'. Can you tell me what I am doing wrong? Any help would be highly appreciated. Thanks in advance.
import requests
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
offset = 0
url = 'https://www.tripadvisor.com/Attractions-g255057-Activities-oa' + str(offset) + '-Canberra_Australian_Capital_Territory-Hotels.html#ATTRACTION_LIST_CONTENTS'
urls = []
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
for link in soup.find_all('a', {'last'}):
page_number = link.get('data-page-number')
last_offset = int(page_number) * 30
print('last offset:', last_offset)
for offset in range(0, last_offset, 30):
print('--- page offset:', offset, '---')
url = 'https://www.tripadvisor.com/Attractions-g255057-oa' + str(offset) + '-Canberra_Australian_Capital_Territory-Hotels.html#ATTRACTION_LIST_CONTENTS'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
for link in soup.find_all('a', {'property_title'}):
iurl='https://www.tripadvisor.com/Attraction_Review-g255057' + link.get('href')
print(iurl)
Basically, I want the href of each 'thing to do'.
My desired output for 'things to do' is:
https://www.tripadvisor.com/Attraction_Review-g255057-d3377852-Reviews-Weston_Park-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Attraction_Review-g255057-d591972-Reviews-Canberra_Museum_and_Gallery-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Attraction_Review-g255057-d312426-Reviews-Lanyon_Homestead-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Attraction_Review-g255057-d296666-Reviews-Australian_National_University-Canberra_Australian_Capital_Territory.html
For example, I used the code below to get the href of each restaurant in Canberra.
My code for restaurants, which works perfectly, is:
import requests
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
with requests.Session() as session:
for offset in range(0, 1050, 30):
url = 'https://www.tripadvisor.com/Restaurants-g255057-oa{0}-Canberra_Australian_Capital_Territory.html#EATERY_LIST_CONTENTS'.format(offset)
soup = BeautifulSoup(session.get(url).content, "html.parser")
for link in soup.select('a.property_title'):
iurl = 'https://www.tripadvisor.com/' + link.get('href')
print(iurl)
the output of restaurant code is:
https://www.tripadvisor.com/Restaurant_Review-g255057-d1054676-Reviews-Lanterne_Rooms-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Restaurant_Review-g255057-d755055-Reviews-Courgette_Restaurant-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Restaurant_Review-g255057-d6893178-Reviews-Pomegranate-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Restaurant_Review-g255057-d7262443-Reviews-Les_Bistronomes-Canberra_Australian_Capital_Territory.html
.
.
.
.

OK, it's not that hard; you just have to know which tags to use.
Let me explain with this example:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Collect the link of every "thing to do" across all paginated listing pages.

# Site root; hrefs are resolved against it with urljoin so that a leading "/"
# in an href cannot produce a double slash in the final link.
base_url = 'https://www.tripadvisor.com/'
# Listing page template; "{}" is filled with the result offset.
main_page = 'https://www.tripadvisor.com/Attractions-g255057-Activities-oa{}-Canberra_Australian_Capital_Territory-Hotels.html#ATTRACTION_LIST_CONTENTS'
page_size = 30  # the offset advances by 30 from one page to the next
links = []

with requests.Session() as session:
    # Get the initial page to find the highest offset in the pagination bar.
    soup = BeautifulSoup(session.get(main_page.format(0)).text, "html.parser")
    last_page = max(
        (
            int(page.get('data-offset'))
            for page in soup.find_all('a', {'class': 'pageNum taLnk'})
            if page.has_attr('data-offset')
        ),
        default=0,  # no pagination links: only the first page exists
    )

    # Iterate over every offset (first page, last page, step) and extract links.
    for i in range(0, last_page + page_size, page_size):
        if i:
            # Offset 0 is already parsed above; only later pages need a request.
            soup = BeautifulSoup(session.get(main_page.format(i)).text, "html.parser")
        # Each ('div', {'class': 'listing_title'}) wraps the anchor of one listing.
        for title in soup.find_all('div', {'class': 'listing_title'}):
            anchor = title.find('a')
            if anchor is not None and anchor.get('href'):
                links.append(urljoin(base_url, anchor['href']))

for link in links:
    print(link)
That gives us 8 pages and 212 links in total (30 on each of the first seven pages, 2 on the last).
I hope this clears things up a bit.

Related

Python web scraping multiple pages

I am scraping all the words from the Merriam-Webster website.
I want to scrape all pages from a to z, and all pages within them, and save the words to a text file. The problem I'm having is that I only get the first result of the table instead of all of them. I know that this is a large amount of text (around 500k), but I'm doing it to educate myself.
CODE:
import requests
from bs4 import BeautifulSoup as bs
URL = 'https://www.merriam-webster.com/browse/dictionary/a/'
page = 1
# for page in range(1, 75):
req = requests.get(URL + str(page))
soup = bs(req.text, 'html.parser')
containers = soup.find('div', attrs={'class', 'entries'})
table = containers.find_all('ul')
for entries in table:
links = entries.find_all('a')
name = links[0].text
print(name)
What I want is to get all the entries from this table, but instead I only get the first entry.
I'm kind of stuck here, so any help would be appreciated.
Thanks
https://www.merriam-webster.com/browse/medical/a-z
https://www.merriam-webster.com/browse/legal/a-z
https://www.merriam-webster.com/browse/dictionary/a-z
https://www.merriam-webster.com/browse/thesaurus/a-z
To get all entries, you can use this example:
import requests
from bs4 import BeautifulSoup
# Browse page listing dictionary entries that start with "a".
url = 'https://www.merriam-webster.com/browse/dictionary/a/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Every anchor nested under an element with class "entries" is one entry.
for a in soup.select('.entries a'):
    word = a.text
    word_url = 'https://www.merriam-webster.com' + a['href']
    print('{:<30} {}'.format(word, word_url))
Prints:
(a) heaven on earth https://www.merriam-webster.com/dictionary/%28a%29%20heaven%20on%20earth
(a) method in/to one's madness https://www.merriam-webster.com/dictionary/%28a%29%20method%20in%2Fto%20one%27s%20madness
(a) penny for your thoughts https://www.merriam-webster.com/dictionary/%28a%29%20penny%20for%20your%20thoughts
(a) quarter after https://www.merriam-webster.com/dictionary/%28a%29%20quarter%20after
(a) quarter of https://www.merriam-webster.com/dictionary/%28a%29%20quarter%20of
(a) quarter past https://www.merriam-webster.com/dictionary/%28a%29%20quarter%20past
(a) quarter to https://www.merriam-webster.com/dictionary/%28a%29%20quarter%20to
(all) by one's lonesome https://www.merriam-webster.com/dictionary/%28all%29%20by%20one%27s%20lonesome
(all) choked up https://www.merriam-webster.com/dictionary/%28all%29%20choked%20up
(all) for the best https://www.merriam-webster.com/dictionary/%28all%29%20for%20the%20best
(all) in good time https://www.merriam-webster.com/dictionary/%28all%29%20in%20good%20time
...and so on.
To scrape multiple pages:
# Same browse URL with a trailing page number; pages 1 through 75 are fetched.
url = 'https://www.merriam-webster.com/browse/dictionary/a/{}'
site_root = 'https://www.merriam-webster.com'
for page in range(1, 76):
    page_html = requests.get(url.format(page)).content
    soup = BeautifulSoup(page_html, 'html.parser')
    # Print each entry's text padded to 30 columns, followed by its full URL.
    for a in soup.select('.entries a'):
        print('{:<30} {}'.format(a.text, site_root + a['href']))
EDIT: To get all pages from A to Z:
import requests
from bs4 import BeautifulSoup
# Print every dictionary entry for the letters a-z, following the pagination
# of each letter until no further page is advertised.
url = 'https://www.merriam-webster.com/browse/dictionary/{}/{}'

# One session reuses the connection across the many page requests.
with requests.Session() as session:
    for char in range(ord('a'), ord('z')+1):
        page = 1
        while True:
            soup = BeautifulSoup(session.get(url.format(chr(char), page)).content, 'html.parser')
            for a in soup.select('.entries a'):
                print('{:<30} {}'.format(a.text, 'https://www.merriam-webster.com' + a['href']))

            # The "Last" control carries an empty data-page on the final page.
            # select_one returns None when the control is absent altogether,
            # so treat both cases as "no more pages" instead of crashing.
            last = soup.select_one('[aria-label="Last"]')
            if last is None or not last.get('data-page'):
                break
            page += 1
EDIT 2: To save to file:
import requests
from bs4 import BeautifulSoup
# Print every dictionary entry for the letters a-z and also write each one as
# a "text<TAB>url" line to data.txt.
url = 'https://www.merriam-webster.com/browse/dictionary/{}/{}'

# An explicit UTF-8 encoding keeps non-ASCII entries writable regardless of the
# platform's default encoding; one session reuses the connection.
with open('data.txt', 'w', encoding='utf-8') as f_out, requests.Session() as session:
    for char in range(ord('a'), ord('z')+1):
        page = 1
        while True:
            soup = BeautifulSoup(session.get(url.format(chr(char), page)).content, 'html.parser')
            for a in soup.select('.entries a'):
                print('{:<30} {}'.format(a.text, 'https://www.merriam-webster.com' + a['href']))
                print('{}\t{}'.format(a.text, 'https://www.merriam-webster.com' + a['href']), file=f_out)

            # The "Last" control carries an empty data-page on the final page.
            # select_one returns None when the control is absent altogether,
            # so treat both cases as "no more pages" instead of crashing.
            last = soup.select_one('[aria-label="Last"]')
            if last is None or not last.get('data-page'):
                break
            page += 1
I think you need another loop:
# Walk every anchor of every list in the table, rather than only the first
# anchor of each list, and print its text.
for entries in table:
    for anchor in entries.find_all('a'):
        print(anchor.text)

Iterating over urls fails to find correct href in Python using BeautifulSoup

I am iterating through a website in my code. This is what the code does: it loops through the 52 pages and collects the link to each article.
Then it iterates through those URLs and tries to get the link to the English translation. If you look at the Mongolian website, it has a section "Орчуулга" on the top right with "English" underneath - that is the link to the English translation.
However, my code fails to grab the link to the English translation and gives a wrong URL.
Below is a sample output for the first article.
1
{'https://mn.usembassy.gov/mn/2020-naadam-mn/': 'https://mn.usembassy.gov/mn/sitemap-mn/'}
The expected output for the first page should be
1
{'https://mn.usembassy.gov/mn/2020-naadam-mn/': 'https://mn.usembassy.gov/2020-naadam/'}
Below is my code
import requests
from bs4 import BeautifulSoup
url = 'https://mn.usembassy.gov/mn/news-events-mn/page/{page}/'
urls = []
for page in range(1, 53):
print(str(page) + "/52")
soup = BeautifulSoup(requests.get(url.format(page=page)).content, 'html.parser')
for h in soup.find_all('h2'):
a = h.find('a')
urls.append(a.attrs['href'])
print(urls)
i = 0
bilingual_dict = {}
for url in urls:
i += 1
print(i)
soup = BeautifulSoup(requests.get(url.format(page=url)).content, 'html.parser')
for div in soup.find_all('div', class_='translations_sidebar'):
for ul in soup.find_all('ul'):
for li in ul.find_all('li'):
a = li.find('a')
bilingual_dict[url] = a['href']
print(bilingual_dict)
print(bilingual_dict)
This script will print link to english translation:
import requests
from bs4 import BeautifulSoup
# Print the link to the English translation of a single article.
url = 'https://mn.usembassy.gov/mn/2020-naadam-mn/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
# The translation link is the anchor whose hreflang attribute is "en".
link = soup.select_one('a[hreflang="en"]')
# select_one returns None when no such anchor exists; print None in that case
# instead of failing on the subscript.
print(link['href'] if link else None)
Prints:
https://mn.usembassy.gov/2020-naadam/
Complete code: (Where there isn't link to english translation, the value is set to None)
import requests
from bs4 import BeautifulSoup
from pprint import pprint
# Collect every article URL from the 52 news listing pages, then map each
# article URL to its English translation URL (None when there is none).
url = 'https://mn.usembassy.gov/mn/news-events-mn/page/{page}/'
urls = []
bilingual_dict = {}

# One session reuses the connection across the listing and article requests.
with requests.Session() as session:
    for page in range(1, 53):
        print('Page {}...'.format(page))
        soup = BeautifulSoup(session.get(url.format(page=page)).content, 'html.parser')
        for h in soup.find_all('h2'):
            a = h.find('a')
            # Skip headings that contain no link; h.find returns None for them.
            if a is not None and a.get('href'):
                urls.append(a['href'])
    pprint(urls)

    # A separate loop name keeps the listing template in `url` intact.
    for article_url in urls:
        print(article_url)
        soup = BeautifulSoup(session.get(article_url).content, 'html.parser')
        link = soup.select_one('a[hreflang="en"]')
        bilingual_dict[article_url] = link['href'] if link else None

pprint(bilingual_dict)

Following links with a second request - Web crawler

Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through results from a travel website. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the links and gather the pieces of information from each of those pages. But I am stuck. Hope you can give me a hint.
Here is my code:
import requests
from bs4 import BeautifulSoup
import urllib, collections
Spider =1
def trade_spider(max_pages):
RegionIDArray = {737: "London"}
for reg in RegionIDArray:
page = -1
r = requests.get("https://www.viatorcom.de/London/d" +str(reg) +"&page=" + str(page) , verify = False)
soup = BeautifulSoup(r.content, "lxml")
g_data = soup.find_all("h2", {"class": "mtm mbn card-title"})
for item in g_data:
Deeplink = item.find_all("a")
for t in set(t.get("href") for t in Deeplink):
Deeplink_final = t
print(Deeplink_final) #The output shows all the links that I would like to follow and gather information from.
trade_spider(1)
Output:
/de/7132/London-attractions/Stonehenge/d737-a113
/de/7132/London-attractions/Tower-of-London/d737-a93
/de/7132/London-attractions/London-Eye/d737-a1400
/de/7132/London-attractions/Thames-River/d737-a1410
The output shows all the links that I would like to follow and gather information from.
Next step in my code:
import requests
from bs4 import BeautifulSoup
import urllib, collections
Spider =1
def trade_spider(max_pages):
RegionIDArray = {737: "London"}
for reg in RegionIDArray:
page = -1
r = requests.get("https://www.viatorcom.de/London/d" +str(reg) +"&page=" + str(page) , verify = False)
soup = BeautifulSoup(r.content, "lxml")
g_data = soup.find_all("h2", {"class": "mtm mbn card-title"})
for item in g_data:
Deeplink = item.find_all("a")
for t in set(t.get("href") for t in Deeplink):
Deeplink_final = t
trade_spider(1)
def trade_spider2(max_pages):
r = requests.get("https://www.viatorcom.de" + Deeplink_final, verify = False)
soup = BeautifulSoup(r.content, "lxml")
print(soup)
trade_spider2(9)
I would like to append the initially crawled output to my second request, but this doesn't work. I hope you can give me a hint.
This should help.
import requests
from bs4 import BeautifulSoup
import urllib, collections
Spider =1
def trade_spider2(Deeplink_final):
    """Fetch one detail page and print its parsed HTML.

    Deeplink_final is a path that is appended to the
    "https://www.viatorcom.de" host. The request is made with
    certificate verification disabled (verify=False).
    """
    detail_url = "https://www.viatorcom.de" + Deeplink_final
    response = requests.get(detail_url, verify = False)
    print(BeautifulSoup(response.content, "lxml"))
def trade_spider(max_pages):
    """Crawl the listing page of each region and follow every card link.

    For each region id, the listing page is requested, the anchors inside
    the "mtm mbn card-title" headings are collected, and each distinct
    href is handed to trade_spider2. max_pages is accepted but never read.
    """
    RegionIDArray = {737: "London"}
    for reg in RegionIDArray:
        page = -1
        listing_url = "https://www.viatorcom.de/London/d" + str(reg) + "&page=" + str(page)
        listing = requests.get(listing_url, verify = False)
        soup = BeautifulSoup(listing.content, "lxml")
        for item in soup.find_all("h2", {"class": "mtm mbn card-title"}):
            # A set drops duplicate hrefs within one heading.
            hrefs = set(anchor.get("href") for anchor in item.find_all("a"))
            for Deeplink_final in hrefs:
                trade_spider2(Deeplink_final)
trade_spider(1)

Soup.find_all is only returning Some of the results in Python 3.5.1

I'm trying to get all of the urls for thumbnails from my webpage that have the class = "thumb", but soup.find_all is only printing the most recent 22 or so.
Here is the Code:
import requests
from bs4 import BeautifulSoup
r = requests.get("http://rayleighev.deviantart.com/gallery/44021661/Reddit")
soup = BeautifulSoup(r.content, "html.parser")
links = soup.find_all("a", {'class' : "thumb"})
for link in links:
print(link.get("href"))
I think you meant to ask about following the pagination and grabbing all the links in a list. Here is the implementation of that idea - use the offset parameter and grab links until there are no more links present incrementing the offset by 24 (number of links per page):
import requests
from bs4 import BeautifulSoup
# Gather thumbnail links page by page until a page yields none.
links = []
offset = 0
with requests.Session() as session:
    while True:
        page = session.get("http://rayleighev.deviantart.com/gallery/44021661/Reddit?offset=%d" % offset)
        thumbs = BeautifulSoup(page.content, "html.parser").find_all("a", {'class': "thumb"})
        if not thumbs:
            # An empty page means the gallery is exhausted.
            break
        links.extend([anchor["href"] for anchor in thumbs])
        print(len(links))
        # Step to the next page; the offset advances by 24 each time.
        offset += 24
print(links)

Scraping multiple paginated links with BeautifulSoup and Requests

Python Beginner here. I'm trying to scrape all products from one category on dabs.com. I've managed to scrape all products on a given page, but I'm having trouble iterating over all the paginated links.
Right now, I've tried to isolate all the pagination buttons with the span class="page-list", but even that isn't working. Ideally, I would like to make the crawler keep clicking "Next" until it has scraped all products on all pages. How can I do this?
Really appreciate any input
from bs4 import BeautifulSoup
import requests
base_url = "http://www.dabs.com"
page_array = []
def get_pages():
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
page_list = soup.findAll('span', class="page-list")
pages = page_list[0].findAll('a')
for page in pages:
page_array.append(page.get('href'))
def scrape_page(page):
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
Product_table = soup.findAll("table")
Products = Product_table[0].findAll("tr")
if len(soup.findAll('tr')) > 0:
Products = Products[1:]
for row in Products:
cells = row.find_all('td')
data = {
'description' : cells[0].get_text(),
'price' : cells[1].get_text()
}
print data
get_pages()
[scrape_page(base_url + page) for page in page_array]
Their next-page button has a title of "Next", so you could do something like:
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

# Find the link to the next page of a category listing.

# requests needs an explicit scheme; a bare "www..." URL raises MissingSchema.
url = 'http://www.dabs.com/category/computing/11001/'
base_url = 'http://www.dabs.com'
r = requests.get(url)
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed.
soup = bs(r.text, 'html.parser')
elm = soup.find('a', {'title': 'Next'})
# find returns None when there is no "Next" anchor (presumably the last page);
# urljoin resolves the href against the site root whether or not it starts
# with "/".
next_page_link = urljoin(base_url, elm['href']) if elm is not None else None
Hope that helps.

Categories