I have a problem writing a Wikipedia web crawler. The crawler needs to display the "See also" section for a given link, and it also has to show the "See also" section for every link found in that first "See also" section. For example: the Wikipedia page https://en.wikipedia.org/wiki/Internet has a "See also" section containing, among others, https://en.wikipedia.org/wiki/Crowdfunding, and that Crowdfunding page in turn contains, for example, https://en.wikipedia.org/wiki/Angel_investor
This example follows single links, but a "See also" section usually has 10+ links, and that is what I need to handle. I also have to do it RECURSIVELY. Here's what my draft looks like, but it gives me errors and it doesn't work the way it should (it's not even recursive) :D
#Import Libraries
import time #For Delay
import urllib.request #Extracting web pages
import re
#Defining pages
starting_page = "https://en.wikipedia.org/wiki/Spacetime"
seed_page = "https://en.wikipedia.org" #Crawling the English Wikipedia
#Downloading entire Web Document (Raw Page Content)
def download_page(url):
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib.request.Request(url, headers = headers)
        resp = urllib.request.urlopen(req)
        respData = str(resp.read())
        return respData
    except Exception as e:
        print(str(e))
#Extract the "See also" section elements
def extract_see_also(page):
    if 'id="See_also">' in page:
        start_see_also = page.find('id="See_also">')
        start_list_items = page.find('<li>', start_see_also + 1)
        end_see_also = page.find('<h2>', start_list_items + 1)
        see_also_section = page[start_list_items: end_see_also]
        pure_item_raw = (re.sub(r'<.+?>', '', see_also_section)).replace('\n', ',')
        pure_item_raw2 = pure_item_raw.replace(',,', ',')
        pure_item = pure_item_raw2.replace(',,', ',')
        flag = 0
    else:
        pure_item = "No Related Links"
        flag = 1
    return pure_item, flag
#Getting all links with the help of 'get_next_links'
def get_all_links(page):
    links = []
    while True:
        link, end_link = get_next_link(page)
        if link == "no_links":
            break
        else:
            links.append(link) #Append all the links in the list named 'links'
            #time.sleep(0.1)
            page = page[end_link:]
    return links
#Crawl Initiation
#Check for file type in URL so crawler does not crawl images and text files
def extension_scan(url):
    a = ['.png','.jpg','.jpeg','.gif','.tif','.txt']
    j = 0
    while j < (len(a)):
        if a[j] in url:
            #print("There!")
            flag2 = 1
            break
        else:
            #print("Not There!")
            flag2 = 0
        j = j+1
    #print(flag2)
    return flag2
#URL parsing for incomplete or duplicate URLs
def url_parse(url):
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse
    url = url #.lower() #Make it lower case
    s = urlparse(url) #parse the given url
    seed_page_n = seed_page #.lower() #Make it lower case
    #t = urlparse(seed_page_n) #parse the seed page (reference page)
    i = 0
    flag = 0
    while i <= 9:
        if url == "/":
            url = seed_page_n
            flag = 0
        elif not s.scheme:
            url = "http://" + url
            flag = 0
        elif "#" in url:
            url = url[:url.find("#")]
            flag = 0
        elif "?" in url:
            url = url[:url.find("?")]
            flag = 0
        elif s.netloc == "":
            url = seed_page + s.path
            flag = 0
        #elif "www" not in url:
        #    url = "www."[:7] + url[7:]
        #    flag = 0
        elif url[len(url)-1] == "/":
            url = url[:-1]
            flag = 0
        #elif s.netloc != t.netloc:
        #    url = url
        #    flag = 1
        #    break
        else:
            url = url
            flag = 0
            break
        i = i+1
        s = urlparse(url) #Parse after every loop to update the values of url parameters
    return (url, flag)
t0 = time.time()
database = {} #Create a dictionary
#Main Crawl function that calls all the above function and crawls the entire site sequentially
def web_crawl():
    to_crawl = [starting_page] #Start the 'to_crawl' list with the seed page
    #print(to_crawl)
    crawled = [] #List of URLs that have already been crawled
    #database = {} #Create a dictionary
    #k = 0;
    for k in range(0, 3):
        i = 0 #Initiate variable to count the number of iterations
        while i < 3: #Loop a fixed number of times over the 'to_crawl' list
            urll = to_crawl.pop(0) #If there are elements in to_crawl then pop out the first element
            urll, flag = url_parse(urll)
            #print(urll)
            flag2 = extension_scan(urll)
            time.sleep(3)
            #If flag = 1, then the URL is outside the seed domain URL
            if flag == 1 or flag2 == 1:
                pass #Do nothing
            else:
                if urll in crawled: #Else check if the URL is already crawled
                    pass #Do nothing
                else: #If the URL is not already crawled, then crawl it and extract all the links from it
                    print("Link = " + urll)
                    raw_html = download_page(urll)
                    #print(raw_html)
                    see_also, flag2 = extract_see_also(raw_html)
                    print("Related Links = " + see_also)
                    crawled.append(urll)
                    #Remove duplicates from to_crawl
                    n = 1
                    j = 0
                    #k = 0
                    while j < (len(to_crawl)-n):
                        if to_crawl[j] in to_crawl[j+1:(len(to_crawl)-1)]:
                            to_crawl.pop(j)
                            n = n+1
                        else:
                            pass #Do nothing
                        j = j+1
            i = i+1
            #print(to_crawl)
            #print("Iteration No. = " + str(i))
            #print("To Crawl = " + str(len(to_crawl)))
            #print("Crawled = " + str(len(crawled)))
    return ""
print (web_crawl())
t1 = time.time()
total_time = t1-t0
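To make the recursion requirement concrete, here is a rough sketch of the behaviour I am aiming for; see_also_links is a hypothetical helper (my draft above does not have it) that would download a page and return the actual URLs from its "See also" section:

def see_also_links(url):
    # hypothetical helper: download `url` and return a list of the URLs
    # linked from its "See also" section
    raise NotImplementedError

def crawl_see_also(url, depth=2):
    if depth == 0:
        return
    for link in see_also_links(url):
        print(link)
        crawl_see_also(link, depth - 1) # recurse into every related page

#crawl_see_also(starting_page)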
Every time I try converting this to work as a while loop it cycles endlessly; any ideas would be very much appreciated. If I use a for loop it runs perfectly fine, so I assumed that a while loop iterating over an index value would achieve the same result.
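For context, this is the equivalence I assumed between the two loop styles (a trivial example, not my actual code):

items = ["a", "b", "c"]

# for loop version
for i in range(len(items)):
    print(items[i])

# what I assumed was the equivalent while loop
i = 0
while i < len(items):
    print(items[i])
    i += 1

My actual code follows.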
from fuzzywuzzy import fuzz
import time
import fitz
from date_check import locate_date
## Each header is a list containing the header text and the form name ##
headers = [["header1", "Header1"]]
## cast to lowercase ##
for header in headers:
    header[0] = header[0].lower()
## One of the following is expected to be on the last page of the form ##
end_texts = ["Signature", "Signed"]
## cast to lowercase ##
for i in range(len(end_texts)):
    end_texts[i] = end_texts[i].lower()
## set variables ##
forms = []
first_page = 0
header = ""
## Scan entire document for headers ##
def scan_document(document):
    document = fitz.open(document)
    first_page = False
    last_page = False
    index = 0
    ## This is the loop in question ##
    for i in range(len(document)):
        page = document[i]
        text = page.get_text("text")
        text = text.lower()
        if first_page == False:
            for header in headers:
                if fuzz.partial_ratio(header[0], text) > 90:
                    first_page = i
                    ## Find the date on the page ##
                    date = locate_date(text)
                    forms.append([date, header[1], first_page])
                    break
        elif first_page != False and last_page == False:
            for end_text in end_texts:
                if end_text in text:
                    last_page = i
                    forms[index].append(last_page)
                    first_page = False
                    last_page = False
                    index += 1
                    break
    ## Return forms list containing first and last page of each form as well as the header ##
    return(forms)
I tried using a while loop and iterating over an index, but the program hangs whenever I use it.
## set variables ##
forms = []
first_page = 0
header = ""
## Scan entire document for headers ##
def scan_document(document):
    document = fitz.open(document)
    first_page = False
    last_page = False
    page_num = 0
    index = 0
    while page_num <= len(document):
        page = document[page_num]
        text = page.get_text("text")
        text = text.lower()
        if first_page == False:
            for header in headers:
                if fuzz.partial_ratio(header[0], text) > 90:
                    first_page = page_num
                    ## Find the date on the page ##
                    date = locate_date(text)
                    forms.append([date, header[1], first_page])
                    page_num += 1
                    break
        elif first_page != False and last_page == False:
            for end_text in end_texts:
                if end_text in text:
                    last_page = page_num
                    forms[index].append(last_page)
                    first_page = False
                    last_page = False
                    index += 1
                    page_num += 1
                    break
        else:
            page_num += 1
    ## Return forms list containing first and last page of each form as well as the header ##
    return(forms)
There are some cases in which the page_num += 1 line is never reached. You can increment page_num as the first operation after entering the while loop, but remember to use page_num - 1 whenever you index with it.
## set variables ##
forms = []
first_page = 0
header = ""
## Scan entire document for headers ##
def scan_document(document):
    document = fitz.open(document)
    first_page = False
    last_page = False
    page_num = 0
    index = 0
    while page_num < len(document): # use < so the increment below never indexes past the last page
        page_num += 1
        page = document[page_num - 1]
        text = page.get_text("text")
        text = text.lower()
        if first_page == False:
            for header in headers:
                if fuzz.partial_ratio(header[0], text) > 90:
                    first_page = page_num - 1
                    ## Find the date on the page ##
                    date = locate_date(text)
                    forms.append([date, header[1], first_page])
                    break
        elif first_page != False and last_page == False:
            for end_text in end_texts:
                if end_text in text:
                    last_page = page_num - 1
                    forms[index].append(last_page)
                    first_page = False
                    last_page = False
                    index += 1
                    break
    ## Return forms list containing first and last page of each form as well as the header ##
    return(forms)
You need to increment page_num in every iteration; otherwise some branches never advance it and the loop hangs on the same page.
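A minimal illustration of that pattern outside the PDF code (generic Python with a made-up items list):

items = ["a", "b", "c"]
i = 0
while i < len(items):
    i += 1                 # advance first, so no branch can skip the increment
    current = items[i - 1] # remember the -1 offset when indexing
    print(current)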
I followed a YouTube tutorial on web scraping to scrape https://books.toscrape.com/, but I'm getting an empty result.
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
all_books = []
url = "http://books.toscrape.com/catalogue/page-1.html"
def get_page(url):
    page = requests.get(url)
    status = page.status_code
    soup = bs(page.text, "lxml")
    return [soup, status]

def get_links(soup):
    links = []
    listings = soup.find_all(class_="product_pod")
def extract_info(links):
    for listing in listings:
        bk_lnk = listing.find("h5").a.get("href")
        base_url = "http://books.toscrape.com/catalogue"
        cmplt_lnk = base_url + bk_lnk
        links.append(cmplt_lnk)
    return links
def extract_info(links):
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        title = book_soup.find(class_="col-sm-6 product_main").h1.text.strip()
        price = book_soup.find(class_="col-sm-6 product_main").p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)
pg = 1
while True:
    url = f"http://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        print("The End")
        break

df = pd.DataFrame(all_books)
print(df)
Here's the result I'm getting:
Empty DataFrame
Columns: []
Index: []
my colab notebook link
https://colab.research.google.com/drive/1Lyvwt_WLpE9tqy1qheZg80N70CFSsk-E?usp=sharing
def get_links(soup):
    links = []
    listings = soup.find_all(class_="product_pod")

    def extract_links():
        for listing in listings:
            bk_lnk = listing.find("h3").a.get("href")
            base_url = "https://books.toscrape.com/catalogue/"
            cmplt_lnk = base_url + bk_lnk
            links.append(cmplt_lnk)
        return links

    return extract_links()

def extract_info(links):
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        title = book_soup.find(class_="col-sm-6 product_main").h1.text.strip()
        price = book_soup.find(class_="col-sm-6 product_main").p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)

pg = 45
while True:
    url = f"https://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        print("The End")
        break
Your list is empty because your functions are never actually called. For example, get_page(url) returns a list [soup, status]; its soup element has to be passed on to get_links, and get_links has to call (and return the result of) its inner extract_links, as in the code above.
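A minimal sketch of that call chain, using the functions defined above:

soup, status = get_page("http://books.toscrape.com/catalogue/page-1.html") # get_page returns [soup, status]
if status == 200:
    links = get_links(soup) # get_links must call (and return) extract_links()
    extract_info(links)     # fills the all_books list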
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page=1&q=laptop&sid=6bo%2Fb5g&viewType=list")
c = r.content
soup = BeautifulSoup(c, "html.parser")

all = soup.find_all("div", {"class": "col _2-gKeQ"})
page_nr = soup.find_all("a", {"class": "_33m_Yg"})[-1].text
print(page_nr, "number of pages were found")
#all[0].find("div",{"class":"_1vC4OE _2rQ-NK"}).text

l = []
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page=1&q=laptop&sid=6bo%2Fb5g&viewType=list"
for page in range(0, int(page_nr)*10, 10):
    print()
    r = requests.get(base_url + str(page) + ".html")
    c = r.content
    #c = r.json()["list"]
    soup = BeautifulSoup(c, "html.parser")
    for item in all:
        d = {}
        #price
        d["Price"] = item.find("div", {"class": "_1vC4OE _2rQ-NK"}).text
        #Name
        d["Name"] = item.find("div", {"class": "_3wU53n"}).text
        for li in item.find_all("li", {"class": "_1ZRRx1"}):
            if " EMI" in li.text:
                d["EMI"] = li.text
            else:
                d["EMI"] = None
        for li1 in item.find_all("li", {"class": "_1ZRRx1"}):
            if "Special " in li1.text:
                d["Special Price"] = li1.text
            else:
                d["Special Price"] = None
        for val in item.find_all("li", {"class": "tVe95H"}):
            if "Display" in val.text:
                d["Display"] = val.text
            elif "Warranty" in val.text:
                d["Warrenty"] = val.text
            elif "RAM" in val.text:
                d["Ram"] = val.text
        l.append(d)

import pandas
df = pandas.DataFrame(l)
This might work on standard pagination:
i = 1
items_parsed = set()
loop = True
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page={}&q=laptop&sid=6bo%2Fb5g&viewType=list"
while True:
    page = requests.get(base_url.format(i))
    items = requests.get(#yourelements#)
    if not items:
        break
    for item in items:
        #Scrape your item; once the scrape succeeds, put the URL of the parsed item into url_parsed (details below the code), for example:
        url_parsed = your_stuff(items)
        if url_parsed in items_parsed:
            loop = False
        items_parsed.add(url_parsed)
    if not loop:
        break
    i += 1
I formatted your URL so that the ?page=X part is filled in by base_url.format(i); the loop can then keep iterating until no items are found on the page. However, some sites simply return page 1 again once you go past max_page + 1. To handle that case you can declare a set(), put the URL of every item you parse into it, and stop as soon as you encounter an item you have already parsed.
Note that this is just an idea.
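A tiny self-contained illustration of that stop condition, with made-up item lists standing in for scraped pages:

pages = [["a", "b"], ["c", "d"], ["a", "b"]] # the third "page" repeats page 1
seen = set()
for page_items in pages:
    if any(item in seen for item in page_items):
        break               # we have looped back to already-parsed items
    seen.update(page_items)
print(seen)                 # the four unique items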
Since the page number in the URL is almost in the middle I'd apply a similar change to your code:
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page="
end_url = "&q=laptop&sid=6bo%2Fb5g&viewType=list"
for page in range(1, int(page_nr) + 1):
    r = requests.get(base_url + str(page) + end_url)
From the initial URL you only have access to the first 10 pages. You can make a loop that goes from "&page=1" to "&page=26", as sketched below.
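A minimal sketch of that loop, reusing the base_url/end_url split from the previous answer (my combination of the two suggestions, not part of either original answer):

for page in range(1, 27): # "&page=1" ... "&page=26"
    r = requests.get(base_url + str(page) + end_url)
    soup = BeautifulSoup(r.content, "html.parser")
    # ...parse the items on this page...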
I want to create a piece of code that works as follows: you feed it a URL, it looks at how many links there are on that web page, follows one of them, looks at the links on the new page, follows one again, and so on.
I have a piece of code that opens a web page, searches for links and creates a list from them:
import urllib
from bs4 import BeautifulSoup

list_links = []
page = raw_input('enter an url')
url = urllib.urlopen(page).read()
html = BeautifulSoup(url, 'html.parser')

for link in html.find_all('a'):
    link = link.get('href')
    list_links.append(link)
Next, I want the user to decide which link to follow, so I have this:
link_number = len(list_links)
print 'enter a number between 0 and', (link_number)
number = raw_input('')

for number in number:
    if int(number) < 0 or int(number) > link_number:
        print "The End."
        break
    else:
        continue

url_2 = urllib.urlopen(list_links[int(number)]).read()
Here my code crashes.
Ideally, I would like to have an endless process (until the user stops it by entering a wrong number), like this: open the page -> count the links -> choose one -> follow this link and open the new page -> count the links...
Can anybody help me?
You can try using this (sorry if it's not exactly pretty, I wrote it in a bit of a hurry):
import requests, random
from bs4 import BeautifulSoup as BS
from time import sleep

def main(url):
    content = scraping_call(url)
    if not content:
        print "Couldn't get html..."
        return
    else:
        links_list = []
        soup = BS(content, 'html5lib')
        for link in soup.findAll('a'):
            try:
                links_list.append(link['href'])
            except KeyError:
                continue
        chosen_link_index = input("Enter a number between 0 and %d: " % len(links_list))
        if not 0 < chosen_link_index <= len(links_list):
            raise ValueError('Number must be between 0 and %d: ' % len(links_list))
            #script will crash here.
            #If you want the user to try again, you can
            #set up a nr of attempts, like in scraping_call()
        else:
            #if user wants to stop the infinite loop
            next_step = raw_input('Continue or exit? (Y/N) ') or 'Y'
            # default value is 'yes', so if you want to continue,
            # just press Enter
            if next_step.lower() == 'y':
                main(links_list[chosen_link_index])
            else:
                return

def scraping_call(url):
    attempt = 1
    while attempt < 6:
        try:
            page = requests.get(url)
            if page.status_code == 200:
                result = page.content
            else:
                result = ''
        except Exception, e:
            result = ''
            print 'Failed attempt (', attempt, '):', e
            attempt += 1
            sleep(random.randint(2, 4))
            continue
        return result

if __name__ == '__main__':
    main('enter the starting URL here')
Some of the links on a given web page can appear as relative addresses, and we need to take this into account. This should do the trick; it works on Python 3.4.
from urllib.request import urlopen
from urllib.parse import urljoin, urlsplit
from bs4 import BeautifulSoup

addr = input('enter an initial url: ')
while True:
    html = BeautifulSoup(urlopen(addr).read(), 'html.parser')
    list_links = []
    num = 0
    for link in html.find_all('a'):
        url = link.get('href')
        if not urlsplit(url).netloc:
            url = urljoin(addr, url)
        if urlsplit(url).scheme in ['http', 'https']:
            print("%d : %s " % (num, str(url)))
            list_links.append(url)
            num += 1
    idx = int(input("enter an index between 0 and %d: " % (len(list_links) - 1)))
    if not 0 <= idx < len(list_links):
        raise ValueError('Number must be between 0 and %d: ' % len(list_links))
    addr = list_links[idx]
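A quick illustration of why the urljoin step matters (standard library only):

from urllib.parse import urljoin

base = "https://example.com/docs/index.html"
print(urljoin(base, "page2.html"))          # https://example.com/docs/page2.html
print(urljoin(base, "/about"))              # https://example.com/about
print(urljoin(base, "https://other.org/x")) # absolute links pass through unchanged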
I want to know how I can collect all the URLs from the page source using Beautiful Soup, visit each of them one by one in the Google search results, and then move on to the next Google index pages.
Here is the URL I want to collect results from: https://www.google.com/search?q=site%3Awww.rashmi.com&rct=j, and a screenshot is here: http://www.rashmi.com/blog/wp-content/uploads/2014/11/screencapture-www-google-com-search-1433026719960.png
Here is the code I'm trying:
def getPageLinks(page):
    links = []
    for link in page.find_all('a'):
        url = link.get('href')
        if url:
            if 'www.rashmi.com/' in url:
                links.append(url)
    return links

def Links(url):
    pUrl = urlparse(url)
    return parse_qs(pUrl.query)[0]

def PagesVisit(browser, printInfo):
    pageIndex = 1
    visited = []
    time.sleep(5)
    while True:
        browser.get("https://www.google.com/search?q=site:www.rashmi.com&ei=50hqVdCqJozEogS7uoKADg" + str(pageIndex) + "&start=10&sa=N")
        pList = []
        count = 0
        pageIndex += 1
Try this; it should work.
def getPageLinks(page):
    links = []
    for link in page.find_all('a'):
        url = link.get('href')
        if url:
            if 'www.rashmi.com/' in url:
                links.append(url)
    return links

def Links(url):
    pUrl = urlparse(url)
    return parse_qs(pUrl.query)

def PagesVisit(browser, printInfo):
    start = 0
    visited = []
    time.sleep(5)
    while True:
        browser.get("https://www.google.com/search?q=site:www.rashmi.com&ei=V896VdiLEcPmUsK7gdAH&" + str(start) + "&sa=N")
        pList = []
        count = 0
        # Random sleep to make sure everything loads
        time.sleep(random.randint(1, 5))
        page = BeautifulSoup(browser.page_source)
        start += 10
        if start == 500:
            browser.close()
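The answer stops after parsing the page source; a hedged sketch of what could follow inside the while loop, reusing the getPageLinks helper and the visited list from above (my assumption, not part of the original answer):

links = getPageLinks(page) # feed the parsed soup back into the helper
for url in links:
    if url not in visited:
        visited.append(url)
        print(url)         # or browser.get(url) to actually visit it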