I want to know how I can collect all the URLs from the page source using Beautiful Soup, visit each of them one by one in the Google search results, and then move on to the next Google index pages.
Here is the URL I want to collect from, https://www.google.com/search?q=site%3Awww.rashmi.com&rct=j, and here is a screenshot: http://www.rashmi.com/blog/wp-content/uploads/2014/11/screencapture-www-google-com-search-1433026719960.png
Here is the code I'm trying:
def getPageLinks(page):
    links = []
    for link in page.find_all('a'):
        url = link.get('href')
        if url:
            if 'www.rashmi.com/' in url:
                links.append(url)
    return links

def Links(url):
    pUrl = urlparse(url)
    return parse_qs(pUrl.query)[0]

def PagesVisit(browser, printInfo):
    pageIndex = 1
    visited = []
    time.sleep(5)
    while True:
        browser.get("https://www.google.com/search?q=site:www.rashmi.com&ei=50hqVdCqJozEogS7uoKADg" + str(pageIndex) + "&start=10&sa=N")
        pList = []
        count = 0
        pageIndex += 1
Try this; it should work:
from urlparse import urlparse, parse_qs   # Python 2; on Python 3 use urllib.parse
from bs4 import BeautifulSoup
import time
import random

def getPageLinks(page):
    links = []
    for link in page.find_all('a'):
        url = link.get('href')
        if url:
            if 'www.rashmi.com/' in url:
                links.append(url)
    return links

def Links(url):
    pUrl = urlparse(url)
    return parse_qs(pUrl.query)

def PagesVisit(browser, printInfo):
    start = 0
    visited = []
    time.sleep(5)
    while True:
        # "start" is Google's result offset: 0, 10, 20, ...
        browser.get("https://www.google.com/search?q=site:www.rashmi.com&ei=V896VdiLEcPmUsK7gdAH&start=" + str(start) + "&sa=N")
        pList = []
        count = 0
        # Random sleep to make sure everything loads
        time.sleep(random.randint(1, 5))
        page = BeautifulSoup(browser.page_source)
        start += 10
        if start == 500:
            browser.close()
            break
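If you also want to visit each collected result one by one, a minimal sketch along these lines could be dropped into the loop above, right after the page is parsed. It reuses the getPageLinks helper and assumes browser is a Selenium WebDriver; keeping already-opened links in visited is my assumption, not part of the original code:

def visitLinks(browser, page, visited):
    # Open every result link found in the current page source, skipping duplicates.
    for url in getPageLinks(page):
        if url not in visited:
            visited.append(url)
            browser.get(url)                  # visit the result
            time.sleep(random.randint(1, 5))  # give the page time to load

# inside the while loop, after "page = BeautifulSoup(browser.page_source)":
#     visitLinks(browser, page, visited)
# the next iteration requests the following results page anyway, so no browser.back() is needed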
I want to download multiple articles at once and save them into a CSV file.
I have set a limit of 5 pages of the website on the specific topic, but I am not sure it is supposed to take this long. Each page has 10 articles, so it should be 50 articles in total.
The code has been running for more than an hour now; is this normal?
Thank you.
I am using Google Colab for the code:
# Generalized function to get all news-related articles from a Nasdaq webpage
def get_news_urls(links_site):
    resp = requests.get(links_site)
    if not resp.ok:
        return None
    html = resp.content
    soup = BeautifulSoup(html, 'lxml')
    links = soup.find_all('a')
    urls = [link.get('href') for link in links]
    urls = [url for url in urls if url is not None]
    news_urls = [url for url in urls if '/article/' in url]
    return news_urls

def scrape_all_articles(ticker, upper_page_limit=5):
    landing_site = 'http://www.nasdaq.com/symbol/' + ticker + '/news-headlines'
    all_news_urls = get_news_urls(landing_site)
    current_urls_list = all_news_urls.copy()
    index = 2
    while (current_urls_list is not None) and (current_urls_list != []) and \
          (index <= upper_page_limit):
        current_site = landing_site + '?page=' + str(index)
        current_urls_list = get_news_urls(current_site)
        all_news_urls = all_news_urls + current_urls_list
        index = index + 1
    all_news_urls = list(set(all_news_urls))
    # Now, we have a list of urls, we need to actually scrape the text
    all_articles = [scrape_news_text(news_url) for news_url in all_news_urls]
    return all_articles

nflx_articles = scrape_all_articles('nflx', 5)
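Since the stated goal is to save the articles into a CSV file, here is a minimal sketch of that last step. The single "article" column and the output filename are my assumptions, and it expects scrape_news_text to return plain text per article:

import csv

def save_articles_to_csv(articles, path='nflx_articles.csv'):
    # Write one scraped article per row, under a single "article" column.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['article'])
        for article in articles:
            writer.writerow([article])

save_articles_to_csv(nflx_articles)

On Colab the file is written to the notebook's working directory and can be downloaded from the file browser on the left.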
I am trying to crawl multiple pages of a website, but the program only crawls the first page.
import requests
from bs4 import BeautifulSoup
import re
import json
import time

def make_soup(url):
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'lxml')
    pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
    script = soup.find("script", text=pattern)
    jsonData = pattern.search(script.text).group(1)
    pattern_number = re.compile(r'\"[0-9]{9,12}\":(\{\"data\":\{\"cachedFilters\":(.*?)\}\}),\"[0-9]{9,11}\"')
    jsonData2 = pattern_number.search(jsonData).group(1)
    dictData = json.loads(jsonData2)
    return dictData

def get_reviews(dictData):
    """Return a list of five dicts with reviews."""
    all_dictionaries = []
    for data in dictData['data']['locations']:
        for reviews in data['reviewListPage']['reviews']:
            review_dict = {}
            review_dict["reviewid"] = reviews['id']
            review_dict["reviewurl"] = reviews['absoluteUrl']
            review_dict["reviewlang"] = reviews['language']
            review_dict["reviewdate"] = reviews['createdDate']
            userProfile = reviews['userProfile']
            review_dict["author"] = userProfile['displayName']
            all_dictionaries.append(review_dict)
    return all_dictionaries

def main():
    url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS'
    dictData = make_soup(url)
    review_list = get_reviews(dictData)  # list with five dicts
    # print(review_list)
    page_number = 5
    while page_number <= 260:  # number in the URL
        next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
        dictData = make_soup(url)
        review_list2 = get_reviews(dictData)
        print(review_list2)
        page_number += 5
        time.sleep(0.5)

if __name__ == "__main__":
    main()
And I'm not sure I can crawl multiple pages with this URL. There are 54 pages on the website, but in the URL I always have to increase the number by 5, like this:
Page 1
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS
Page 2
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or5-Coronado_Hotel-Zurich.html#REVIEWS
Page 3
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or10-Coronado_Hotel-Zurich.html#REVIEWS
I don't know if this is a good idea.
Do you have any suggestions? Thank you in advance!
You assign the new URL to next_url, but you use url to read the page.
next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
You have to rename the variable:
url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
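Put together, the corrected loop looks like this; the only change from the original code is the variable name, so make_soup now receives the page that matches the offset:

page_number = 5
while page_number <= 260:  # offset in the URL
    url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
    dictData = make_soup(url)            # fetch the page for this offset
    review_list2 = get_reviews(dictData)
    print(review_list2)
    page_number += 5
    time.sleep(0.5)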
url = 'https://www.tripadvisor.ie/Attraction_Review-g295424-d2038312-Reviews-Global_Village-Dubai_Emirate_of_Dubai.html'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

def get_links():
    review_links = []
    for review_link in soup.find_all('a', {'class': 'title'}, href=True):
        review_link = review_link['href']
        review_links.append(review_link)
    return review_links

link = 'https://www.tripadvisor.ie'
review_urls = []
for i in get_links():
    review_url = link + i
    print(review_url)
    review_urls.append(review_url)
This code saves all the hyperlinks present on this webpage, but I want to scrape the hyperlinks on all pages up to 319. I'm not able to implement this when pagination is disabled.
There is a parameter you can change in the URL to loop over all the reviews.
So I just added a loop that requests all the URLs:
def get_page(index):
    url = "https://www.tripadvisor.ie/Attraction_Review-g295424-d2038312-Reviews-or{}-Global_Village-Dubai_Emirate_of_Dubai.html".format(str(index))
    html = requests.get(url)
    page = soup(html.text, 'html.parser')
    return page

nb_review = 3187
for i in range(0, nb_review, 10):
    page = get_page(i)
The full code using your snippet is:
from bs4 import BeautifulSoup as soup
import requests

def get_page(index):
    url = "https://www.tripadvisor.ie/Attraction_Review-g295424-d2038312-Reviews-or{}-Global_Village-Dubai_Emirate_of_Dubai.html".format(str(index))
    html = requests.get(url)
    page = soup(html.text, 'html.parser')
    return page

def get_links(page):
    review_links = []
    for review_link in page.find_all('a', {'class': 'title'}, href=True):
        review_link = review_link['href']
        review_links.append(review_link)
    return review_links

link = 'https://www.tripadvisor.ie'
review_urls = []
nb_review = 3187

for i in range(0, nb_review, 10):
    page = get_page(i)
    for i in get_links(page):
        review_url = link + i
        review_urls.append(review_url)

print(len(review_urls))
OUTPUT:
3187
EDIT:
You can obviously scrape the first page first and read the review count from it, to make the code more customizable.
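For example, a sketch of that idea; the 'reviews_header_count' class name is an assumption about TripAdvisor's markup and may need adjusting:

first_page = get_page(0)
# The selector is an assumed one; inspect the page to confirm which element holds the count.
count_tag = first_page.find('span', {'class': 'reviews_header_count'})
if count_tag:
    nb_review = int(count_tag.text.strip('()').replace(',', ''))  # e.g. "(3,187)" -> 3187
else:
    nb_review = 3187  # fall back to the hard-coded value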
I'm trying to open the links that contain certain words on their pages. If words such as "engineering" are present on a page, return the link; if not, pass.
Here is what I have so far. The inputs I entered are "engineering" for the job type and "north york" for the location.
import requests
from bs4 import BeautifulSoup
import webbrowser
import time

jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'

r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")

filter_words = ['chemical engineering', 'instrumentation', 'QA']
all_job_url = []
filtered_job_links = []
http_flinks = []
flinks = []

def get_all_joblinks():  # obtains all the links on the search page
    for tag in prettify.find_all('a', {'data-tn-element': "jobTitle"}):
        link = tag['href']
        all_job_url.append(link)

def filter_links():
    for eachurl in all_job_url:  # iterates through each link
        rurl = requests.get(base_url + eachurl)
        content = rurl.content
        soup = BeautifulSoup(content, "html.parser")
        summary = soup.get_text()
        # supposed to filter links based on certain words within text on link page
        if any(word in summary for word in filter_words):
            for filtered_link in soup.find_all('link', {'rel': 'canonical'}):
                flink = filtered_link['href']  # obtains only filtered links
                if "http:" in flink:
                    http_flinks.append(flink)
                    print(http_flinks)
                else:
                    flinks.append(flink)
                    # website = webbrowser.open_new(base_url + flink)
            time.sleep(3)
            print(flinks)
        else:
            print("nothing")
            pass

def search_job():
    while True:
        if prettify.select('div.no_results'):
            print("no job matches found")
            break
        else:
            # opens the web page of job search if entries are found
            website = webbrowser.open_new(url)
            break

get_all_joblinks()
filter_links()
I have written Python code that fetches the web page corresponding to a given URL and parses all the links on that page into a repository of links. Next, it fetches the contents of any URL from the repository just created, parses the links from this new content into the repository, and continues this process for all links in the repository until stopped or until a given number of links have been fetched.
Here is the code:
import BeautifulSoup
import urllib2
import itertools
import random

class Crawler(object):
    """docstring for Crawler"""

    def __init__(self):
        self.soup = None                              # Beautiful Soup object
        self.current_page = "http://www.python.org/"  # Current page's address
        self.links = set()                            # Queue with every links fetched
        self.visited_links = set()
        self.counter = 0                              # Simple counter for debug purpose

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)
        # Fetch every links
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception:  # Magnificent exception handling
            pass
        # Update links
        self.links = self.links.union(set(page_links))
        # Choose a random url from non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all url has been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()
        for link in self.links:
            print link

if __name__ == '__main__':
    C = Crawler()
    C.run()
This code does not fetch internal links (only absolutely formed hyperlinks).
How can I fetch internal links that start with '/', '#', or '.'?
Well, your code kind of already tells you what's going on. In your lambda you are only grabbing absolute links that start with http:// (which means you are not grabbing https, FWIW). You should grab all of the links and check whether they start with http or not. If they don't, they are relative links, and since you know what the current_page is, you can use that to create an absolute link.
Here's a modification to your code. Excuse my Python as it's a little rusty, but I ran it and it worked in Python 2.7 for me. You'll want to clean it up and add some edge/error detection, but you get the gist:
#!/usr/bin/python
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse

class Crawler(object):
    """docstring for Crawler"""

    def __init__(self):
        self.soup = None                              # Beautiful Soup object
        self.current_page = "http://www.python.org/"  # Current page's address
        self.links = set()                            # Queue with every links fetched
        self.visited_links = set()
        self.counter = 0                              # Simple counter for debug purpose

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)
        # Fetch every links
        self.soup = BeautifulSoup(html_code)
        page_links = []
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                print "Found link: '" + link + "'"
                if link.startswith('http'):
                    page_links.append(link)
                    print "Adding link" + link + "\n"
                elif link.startswith('/'):
                    parts = urlparse.urlparse(self.current_page)
                    page_links.append(parts.scheme + '://' + parts.netloc + link)
                    print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
                else:
                    page_links.append(self.current_page + link)
                    print "Adding link " + self.current_page + link + "\n"
        except Exception, ex:  # Magnificent exception handling
            print ex
        # Update links
        self.links = self.links.union(set(page_links))
        # Choose a random url from non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all url has been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()
        for link in self.links:
            print link

if __name__ == '__main__':
    C = Crawler()
    C.run()
Change the condition in the lambda:
page_links = itertools.ifilter(  # Also accept relative links
    lambda href: 'http://' in href or href.startswith('/') or href.startswith('#') or href.startswith('.'),
    (a.get('href') for a in self.soup.findAll('a')))
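Alternatively, urlparse.urljoin from the Python 2 standard library resolves relative links (including ones starting with '/', '#' or '.') against the page they were found on, so a minimal sketch of the same step could also read:

import urlparse

# Turn every href (absolute or relative) into an absolute URL relative to the current page.
page_links = [urlparse.urljoin(self.current_page, a.get('href'))
              for a in self.soup.findAll('a')
              if a.get('href')]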