Yellow Pages Scraper in Python not working

I am trying to scrape data from Yellow Pages. I have used this scraper many times, but it recently stopped working. I get this error:
'NoneType' object has no attribute 'group' 0 results found
Can anyone please help me fix this problem? Where am I going wrong?
import requests
import requests_random_user_agent
import urllib.parse
from bs4 import BeautifulSoup
import re
from math import ceil
import csv
import os
import sys
import subprocess
from os import system, name
import time
from tqdm import tqdm

class Scraper:
    def __init__(self, keyword, location):
        self.keyword = keyword
        self.location = location
        self.params = urllib.parse.urlencode({"search_terms": self.keyword, "geo_location_terms": self.location})

    def get_info(self, link):
        try:
            r = requests.get(link)
            html = BeautifulSoup(r.content, "html.parser")
        except:
            return False
        try:
            name = html.find('h1').text
        except:
            name = None
        try:
            phone = html.find(class_='phone').text
        except:
            phone = None
        try:
            website = html.find('a', class_='primary-btn website-link')["href"]
            if len(website.split("?")) > 1:
                website = website.split("?")[0]
        except:
            website = None
        try:
            email = html.find('a', class_='email-business')["href"].split(":")[1]
        except:
            email = None
        try:
            address = html.find('h2', class_='address').text
        except:
            address = None
        return {"name": name, "email": email, "phone": phone, "address": address, "website": website}

    def get_num_pages(self):
        try:
            url = f"https://www.yellowpages.com/search?{self.params}"
            response = requests.get(url)
            html = BeautifulSoup(response.content, "html.parser")
            pagination = html.find(class_="pagination")
            if not pagination:
                pagination = 1
                links = html.select("a[class='business-name']")
                num_results = 0
                for l in links:
                    try:
                        l["data-analytics"]
                        num_results += 1
                    except:
                        continue
                return num_results, pagination
            num_results = int(re.search('We found(.*?)results', pagination.text).group(1))
            return num_results, int(ceil(int(num_results) / 30))
        except Exception as e:
            print(e)
            return False, False

    def get_links(self, page):
        try:
            url = f"https://www.yellowpages.com/search?{self.params}&page={page}"
            response = requests.request("GET", url, timeout=10)
            html = BeautifulSoup(response.content, "html.parser")
            links = html.select("a[class='business-name']")
            links_filtered = []
            for l in links:
                try:
                    l["data-analytics"]
                    links_filtered.append(l)
                except:
                    continue
            links_list = []
            for link in links_filtered:
                links_list.append(f"https://www.yellowpages.com{link['href']}")
            return links_list
        except Exception as e:
            print(e)
            return []

def open_file(filename):
    try:
        if sys.platform == "win32":
            os.startfile(filename)
        else:
            opener = "open" if sys.platform == "darwin" else "xdg-open"
            subprocess.call([opener, filename])
    except:
        return False

def create_csv(elements):
    row_list = [["Name", "Address", "Phone", "Email", "Website"]]
    for e in elements:
        name = e["name"]
        address = e["address"]
        phone = e["phone"]
        email = e["email"]
        website = e["website"]
        row_list.append([name, address, phone, email, website])
    with open('output.csv', 'w', newline='', encoding='utf8') as file:
        writer = csv.writer(file)
        writer.writerows(row_list)

def clear():
    # for windows
    if name == 'nt':
        _ = system('cls')
    # for mac and linux (here, os.name is 'posix')
    else:
        _ = system('clear')

def main():
    clear()
    try:
        while True:
            keyword = input("Keyword: ")
            if keyword != "":
                break
        while True:
            city = input("City: ")
            if city != "":
                break
        clear()
        scraper = Scraper(keyword, city)
        results, num_pages = scraper.get_num_pages()
        if not results:
            print("0 results found")
            return False
        print(f"{results} results found {keyword} - {city}")
        data = []
        pages = tqdm(range(1, num_pages + 1))
        for page in pages:
            clear()
            try:
                pages.set_description(f"Scraping page {page}/{num_pages}...")
                links = scraper.get_links(page)
                if not (len(links) > 0):
                    continue
                links = tqdm(links)
                for link in links:
                    try:
                        links.set_description(f"Scraping {link}")
                        info = scraper.get_info(link)
                        # print(info)
                        data.append(info)
                        create_csv(data)
                    except:
                        continue
            except:
                continue
        print("Opening file...")
        open_file("output.csv")
        print("Task finished")
    except:
        return False

if __name__ == "__main__":
    main()

It fails on the line
num_results = int(re.search('We found(.*?)results',pagination.text).group(1))
A very simple check of the search results page in the browser would have shown you that the text "We found x results" is not present on the page, so re.search returns None even when there are many results.
Adjust your script to work without num_pages and paginate either via the page links at the bottom or by incrementing the page= parameter in the URL until no more results/pages are listed.
FYI, next time, put in some minimal debugging effort and don't post your entire script.
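For reference, a minimal sketch of that paginate-until-empty approach, reusing the Scraper class and create_csv helper from the question (the keyword/location values below are placeholders):

scraper = Scraper("plumbers", "Miami, FL")  # placeholder inputs
data = []
page = 1
while True:
    links = scraper.get_links(page)
    if not links:  # an empty page means there is nothing left to paginate
        break
    for link in links:
        info = scraper.get_info(link)
        if info:
            data.append(info)
    page += 1
create_csv(data)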

Related

Indeed Job Scraper with Python - only returning url links, no title/description

I know you've probably seen 100 Indeed scraping posts on here, and I'm hoping mine is a bit different. Essentially, I'm trying to build an Indeed job scraper that pulls company name and job title, based on a search with "job title" and "location" as variables. Additionally, when Selenium opens Chrome, Indeed auto-populates my location, which doesn't get overwritten by the location I've input in the code.
I'm fairly new to Python, and I'm relying on the foundation built by someone else from GitHub, so I am having trouble diagnosing the problem.
Would love any help or insight!
Here is my code:
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import json
from time import sleep
# Imports used below but missing from the snippet as posted
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import bs4
import re
import requests

list_of_description = ["warehouse", "associate"]
URL = "https://www.indeed.com/"
MAIN_WINDOW_HANDLER = 0
JOB_TITLE = " "
JOB_LOCATION = " "
JSON_DICT_ARRAY = []

def main():
    pageCounter = 0
    bool_next = True
    newUrl = ""
    # theUrl = "https://ca.indeed.com/jobs?q=developer&l=Winnipeg%2C+MB"
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    browser.get(URL)
    # Change text in what
    whatElement = browser.find_element(By.ID, "text-input-what")
    whatElement.send_keys(JOB_TITLE)
    # Change text in where
    whereElement = browser.find_element(By.ID, "text-input-where")
    whereElement.send_keys(Keys.CONTROL + "a")
    whereElement.send_keys(Keys.BACK_SPACE)
    whereElement.send_keys(JOB_LOCATION)
    whereElement.submit()
    MAIN_WINDOW_HANDLER = browser.window_handles[0]
    fileName = "{} Jobs in {}.json".format(JOB_TITLE, JOB_LOCATION)
    newPage = True
    nextNumber = 2
    searchPhrase = '//span[contains(text(), "{0}") and @class="pn"]'.format(nextNumber)
    currentHTML = browser.page_source
    linkElements = getElementFromHTML('div .title', currentHTML)  # searching for div tags with title class
    reqResultText = currentHTML  # (download_file(URL)).text
    browser.get(browser.current_url)
    browser.get(browser.current_url)
    scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
    if check_exists_by_xpath(browser, '//button[@id="onetrust-accept-btn-handler"]'):
        try:
            theElement = browser.find_element(By.XPATH, '//button[@id="onetrust-accept-btn-handler"]')
            print(type(theElement))
            theElement.click()
            print("I clicked")
            # scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
            while newPage and check_exists_by_xpath(browser, searchPhrase):
                theElement = browser.find_elements(By.XPATH, searchPhrase)
                try:
                    theElement[0].click()
                except:
                    newPage = False
                if newPage:
                    browser.get(browser.current_url)
                    print(browser.current_url)
                    nextNumber += 1
                    searchPhrase = '//span[contains(text(), "{0}") and @class="pn"]'.format(nextNumber)
                    currentHTML = browser.page_source
                    linkElements = getElementFromHTML('div .title', currentHTML)  # searching for div tags with title class
                    reqResultText = currentHTML  # (download_file(URL)).text
                    scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
                else:
                    print("Search Concluded")
        except:
            # scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
            while newPage and check_exists_by_xpath(browser, searchPhrase):
                theElement = browser.find_elements(By.XPATH, searchPhrase)
                try:
                    theElement[0].click()
                except:
                    newPage = False
                if newPage:
                    browser.get(browser.current_url)
                    print(browser.current_url)
                    nextNumber += 1
                    searchPhrase = '//span[contains(text(), "{0}") and @class="pn"]'.format(nextNumber)
                    currentHTML = browser.page_source
                    linkElements = getElementFromHTML('div .title', currentHTML)  # searching for div tags with title class
                    reqResultText = currentHTML  # (download_file(URL)).text
                    scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
                else:
                    print("Search Concluded")
    with open(fileName, "w") as data:
        for it in JSON_DICT_ARRAY:
            data.write(json.dumps(it))
            data.write(",\n")
        data.close()

def scrapeJobListing(linkElements, reqResultText, browser, mainHandler):
    jobDes = ""
    for i in range(len(linkElements)):
        print("\n ", i)
        jsonDataDict = {}
        list = re.findall(r'["](.*?)["]', str(linkElements[i]))
        currJobMap = "jobmap[{}]= ".format(i)
        openBracketIndex = reqResultText.find(currJobMap) + len(currJobMap)
        findNewString = reqResultText[openBracketIndex:openBracketIndex + 600]
        print(findNewString)
        closeBracketIndex = findNewString.find("}") + 1
        cmpOpen = findNewString.find("cmp:'") + len("cmp:'")
        cmpClose = findNewString.find("',cmpesc:")
        titleOpen = findNewString.find("title:'") + len("title:'")
        titleClose = findNewString.find("',locid:")
        parsedString = str(findNewString[0:closeBracketIndex])
        print(parsedString)
        print("\n")
        cmpName = parsedString[cmpOpen:cmpClose]  # Company Name
        jobTitle = parsedString[titleOpen:titleClose]  # Job Title
        jsonDataDict['(2) Company Name'] = cmpName
        jsonDataDict['(1) Job Title'] = jobTitle
        try:
            title = browser.find_element(By.ID, list[4])  # 4th quotation is the Job Description
            print('Found <%s> element with that class name!' % (title.tag_name))
            title.click()
            window_after = browser.window_handles[1]
            browser.switch_to.window(window_after)
            theCurrURL = browser.current_url
            browser.get(theCurrURL)
            currPageSource = browser.page_source
            jsonDataDict['(4) Job Link'] = theCurrURL
            print(theCurrURL)
            jobDes = getElementFromHTML('div #jobDescriptionText', currPageSource)
            soup = bs4.BeautifulSoup(str(jobDes), "html.parser")
            jobDescText = soup.get_text('\n')
            jsonDataDict['(3) Job Description'] = jobDescText
            JSON_DICT_ARRAY.append(jsonDataDict)
            browser.close()
            print(jobDes)
        except:
            print('Was not able to find an element with that name.')
        # sleep(2)
        print(mainHandler)
        browser.switch_to.window(mainHandler)  # Not necessary right?

def getElementBySearch(searchTag, theURL):
    reqResult = download_file(theURL)
    soup = bs4.BeautifulSoup(reqResult.text, "html.parser")
    element = soup.select(searchTag)
    return element

def getElementFromHTML(searchTag, htmlText):
    soup = bs4.BeautifulSoup(htmlText, "html.parser")
    element = soup.select(searchTag)
    return element

def check_exists_by_xpath(webdriver, xpath):
    try:
        webdriver.find_elements(By.XPATH, xpath)
    except NoSuchElementException:
        return False
    return True

def download_file(searchPhrase):
    result = requests.get(searchPhrase)
    # type(result)
    # Check for error
    try:
        result.raise_for_status()
    except Exception as exc:
        print('There was a problem: %s' % (exc))
    return result

if __name__ == "__main__":
    main()
Right now, the script essentially opens Indeed, looks through each page, and prints the links. But I'm not sure why it's not providing the job title and company information.
The output looks like this:
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=10&vjk=cbe41d08db5e3eaa
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=20&vjk=cbe41d08db5e3eaa
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=30&vjk=cbe41d08db5e3eaa
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=40&vjk=2fd38d5eb42b6ca4
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=50&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=60&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=70&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=80&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
Search Concluded

Python web scraping empty result

I followed a YouTube tutorial on web scraping to scrape this website, https://books.toscrape.com/, but I'm getting an empty result.
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

all_books = []
url = "http://books.toscrape.com/catalogue/page-1.html"

def get_page(url):
    page = requests.get(url)
    status = page.status_code
    soup = bs(page.text, "lxml")
    return [soup, status]

def get_links(soup):
    links = []
    listings = soup.find_all(class_="product_pod")

    def extract_info(links):
        for listing in listings:
            bk_lnk = listing.find("h5").a.get("href")
            base_url = "http://books.toscrape.com/catalogue"
            cmplt_lnk = base_url + bk_lnk
            links.append(cmplt_lnk)
        return links

def extract_info(links):
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        title = book_soup.find(class_="col-sm-6 product_main").h1.text.strip()
        price = book_soup.find(class_="col-sm-6 product_main").p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)

pg = 1
while True:
    url = f"http://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        print("The End")
        break

df = pd.DataFrame(all_books)
print(df)
Here's the result I am getting:
Empty DataFrame
Columns: []
Index: []
My Colab notebook link:
https://colab.research.google.com/drive/1Lyvwt_WLpE9tqy1qheZg80N70CFSsk-E?usp=sharing
def get_links(soup):
    links = []
    listings = soup.find_all(class_="product_pod")

    def extract_links():
        for listing in listings:
            bk_lnk = listing.find("h3").a.get("href")
            base_url = "https://books.toscrape.com/catalogue/"
            cmplt_lnk = base_url + bk_lnk
            links.append(cmplt_lnk)
        return links

    return extract_links()

def extract_info(links):
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        title = book_soup.find(class_="col-sm-6 product_main").h1.text.strip()
        price = book_soup.find(class_="col-sm-6 product_main").p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)

pg = 45
while True:
    url = f"https://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        print("The End")
        break
Your list is empty. You need to call your functions, such as get_page(url), which returns a list whose soup element you can then pass to your subsequent functions.
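In other words, something like this minimal sketch of the call chain, reusing the question's function names:

soup, status = get_page("http://books.toscrape.com/catalogue/page-1.html")
if status == 200:
    links = get_links(soup)   # get_links must return the collected links
    extract_info(links)       # fills all_books
print(pd.DataFrame(all_books))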

How to scrape items having same class and id [duplicate]

This question already has answers here:
How to scrape phone no using python when it show after clicked
(2 answers)
Closed 2 years ago.
I want to scrape the person name, location, and phone number, but all of these have the same class and no id. Here is the link to that web page: https://hipages.com.au/connect/emcoelectricalservices. Please guide me. Thank you!
Here is my code:
import requests
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return soup

def get_detail_data(soup):
    try:
        title = (soup.find('h1', class_="sc-AykKI", id=False).text)
    except:
        title = 'Empty Title'
    print(title)
    try:
        contact_person = (soup.find('span', class_="kBpGee", id=False).text)
    except:
        contact_person = 'Empty Person'
    print(contact_person)
    try:
        location = (soup.find('span', class_="kBpGee", id=False).text)
    except:
        location = 'Empty location'
    print(location)

def main():
    # get data of detail page
    url = "https://hipages.com.au/connect/emcoelectricalservices"
    # get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
Hi, the below works:
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 4 09:52:13 2020
@author: prakh
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return soup

def get_detail_data(soup):
    titles = []
    persons = []
    locations = []
    try:
        titles.append(soup.find('h1', class_="sc-AykKI", id=False).text)
    except:
        titles.append('Empty Title')
    try:
        persons.append(soup.findAll('span', class_="Contact__Item-sc-1giw2l4-2 kBpGee", id=False)[1].text)
    except:
        persons.append('Empty Person')
    try:
        locations.append(soup.findAll('span', class_="Contact__Item-sc-1giw2l4-2 kBpGee", id=False)[2].text)
    except:
        locations.append('Empty location')

    final_df = pd.DataFrame(
        {'Title': titles,
         'contact_person': persons,
         'location': locations
         })
    print(final_df)

def main():
    # get data of detail page
    url = "https://hipages.com.au/connect/emcoelectricalservices"
    # get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
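As a more defensive variant (a sketch only, since the auto-generated class names on this site may change), you can also grab all the contact spans in one go with an attribute-substring selector and inspect the list; the "Contact__Item" fragment is taken from the selector above, and the order of the items is an assumption about the current page layout:

soup = get_page("https://hipages.com.au/connect/emcoelectricalservices")
contact_items = [s.get_text(strip=True) for s in soup.select("span[class*='Contact__Item']")]
print(contact_items)  # e.g. phone, contact person, location, depending on the page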

Python scraping data of multiple pages issue

I'm having an issue: my code scrapes everything from only the first page, but I want to scrape data from multiple pages, the same as from the first page. I actually wrote code for multiple pages, and it does move forward to the next page, but it scrapes the data of the first page again. Please have a look at my code and guide me on how I can fix this issue. Thanks!
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
    return soup

def get_detail_page(soup):
    try:
        title = (soup.find('h1', class_="cdm_style", id=False).text)
    except:
        title = 'Empty Title'
    try:
        collection = (soup.find('td', id="metadata_collec").find('a').text)
    except:
        collection = "Empty Collection"
    try:
        author = (soup.find('td', id="metadata_creato").text)
    except:
        author = "Empty Author"
    try:
        abstract = (soup.find('td', id="metadata_descri").text)
    except:
        abstract = "Empty Abstract"
    try:
        keywords = (soup.find('td', id="metadata_keywor").text)
    except:
        keywords = "Empty Keywords"
    try:
        publishers = (soup.find('td', id="metadata_publis").text)
    except:
        publishers = "Empty Publishers"
    try:
        date_original = (soup.find('td', id="metadata_contri").text)
    except:
        date_original = "Empty Date original"
    try:
        date_digital = (soup.find('td', id="metadata_date").text)
    except:
        date_digital = "Empty Date digital"
    try:
        formatt = (soup.find('td', id="metadata_source").text)
    except:
        formatt = "Empty Format"
    try:
        release_statement = (soup.find('td', id="metadata_rights").text)
    except:
        release_statement = "Empty Realease Statement"
    try:
        library = (soup.find('td', id="metadata_librar").text)
    except:
        library = "Empty Library"
    try:
        date_created = (soup.find('td', id="metadata_dmcreated").text)
    except:
        date_created = "Empty date Created"
    data = {
        'Title': title.strip(),
        'Collection': collection.strip(),
        'Author': author.strip(),
        'Abstract': abstract.strip(),
        'Keywords': keywords.strip(),
        'Publishers': publishers.strip(),
        'Date_original': date_original.strip(),
        'Date_digital': date_digital.strip(),
        'Format': formatt.strip(),
        'Release-st': release_statement.strip(),
        'Library': library.strip(),
        'Date_created': date_created.strip()
    }
    return data

def get_index_data(soup):
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except:
        titles_link = []
    else:
        titles_link_output = []
        for link in titles_link:
            try:
                item_id = link.attrs.get('item_id', None)  # All titles with valid links will have an item_id
                if item_id:
                    titles_link_output.append("{}{}".format("http://cgsc.cdmhost.com", link.attrs.get('href', None)))
            except:
                continue
    return titles_link_output

def write_csv(data, url):
    with open('1111_to_5555.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['Title'], data['Collection'], data['Author'],
               data['Abstract'], data['Keywords'], data['Publishers'], data['Date_original'],
               data['Date_digital'], data['Format'], data['Release-st'], data['Library'],
               data['Date_created'], url]
        writer.writerow(row)

def main():
    for x in range(2, 4):
        mainurl = ("http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/")
        print(x)
        url = f"{mainurl}{x}"
        products = get_index_data(get_page(url))
        for product in products:
            data1 = get_detail_page(get_page(product))
            write_csv(data1, product)

if __name__ == '__main__':
    main()
In the get_page() function, try adding headers to the request (note that requests expects headers as a dict):
def get_page(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    response = requests.get(url, headers=headers)

"See also" web crawler

I have a problem with writing a Wikipedia web crawler. This crawler needs to display the "See also" section for a certain link. What is more, it also has to show the "See also" section for every link found in that first "See also" section. For example, this Wiki page, https://en.wikipedia.org/wiki/Internet, has a "See also" section that contains, for example, https://en.wikipedia.org/wiki/Crowdfunding, and the Crowdfunding page contains, for example, https://en.wikipedia.org/wiki/Angel_investor.
This example is based on single links, but the "See also" section usually has 10+ links, and that is what I need to handle. I also have to do it RECURSIVELY. Here's what my draft looks like, but it gives me errors and it's not working like it should (it's not even recursive) :D
#Import Libraries
import time #For Delay
import urllib.request #Extracting web pages
import re

#Defining pages
starting_page = "https://en.wikipedia.org/wiki/Spacetime"
seed_page = "https://en.wikipedia.org" #Crawling the English Wikipedia

#Downloading entire Web Document (Raw Page Content)
def download_page(url):
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib.request.Request(url, headers = headers)
        resp = urllib.request.urlopen(req)
        respData = str(resp.read())
        return respData
    except Exception as e:
        print(str(e))

#Extract the "See also" section elements
def extract_see_also(page):
    if 'id="See_also">' in page:
        start_see_also = page.find('id="See_also">')
        start_list_items = page.find('<li>', start_see_also + 1)
        end_see_also = page.find('<h2>', start_list_items + 1)
        see_also_section = page[start_list_items: end_see_also]
        pure_item_raw = (re.sub(r'<.+?>', '', see_also_section)).replace('\n', ',')
        pure_item_raw2 = pure_item_raw.replace(',,', ',')
        pure_item = pure_item_raw2.replace(',,', ',')
        flag = 0
    else:
        pure_item = "No Related Links"
        flag = 1
    return pure_item, flag

#Getting all links with the help of 'get_next_links'
def get_all_links(page):
    links = []
    while True:
        link, end_link = get_next_link(page)
        if link == "no_links":
            break
        else:
            links.append(link) #Append all the links in the list named 'Links'
            #time.sleep(0.1)
            page = page[end_link:]
    return links

#Crawl Initiation
#Check for file type in URL so crawler does not crawl images and text files
def extension_scan(url):
    a = ['.png','.jpg','.jpeg','.gif','.tif','.txt']
    j = 0
    while j < (len(a)):
        if a[j] in url:
            #print("There!")
            flag2 = 1
            break
        else:
            #print("Not There!")
            flag2 = 0
            j = j+1
    #print(flag2)
    return flag2

#URL parsing for incomplete or duplicate URLs
def url_parse(url):
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse
    url = url #.lower() #Make it lower case
    s = urlparse(url) #parse the given url
    seed_page_n = seed_page #.lower() #Make it lower case
    #t = urlparse(seed_page_n) #parse the seed page (reference page)
    i = 0
    flag = 0
    while i <= 9:
        if url == "/":
            url = seed_page_n
            flag = 0
        elif not s.scheme:
            url = "http://" + url
            flag = 0
        elif "#" in url:
            url = url[:url.find("#")]
            flag = 0
        elif "?" in url:
            url = url[:url.find("?")]
            flag = 0
        elif s.netloc == "":
            url = seed_page + s.path
            flag = 0
        #elif "www" not in url:
        #    url = "www."[:7] + url[7:]
        #    flag = 0
        elif url[len(url)-1] == "/":
            url = url[:-1]
            flag = 0
        #elif s.netloc != t.netloc:
        #    url = url
        #    flag = 1
        #    break
        else:
            url = url
            flag = 0
            break
        i = i + 1
        s = urlparse(url) #Parse after every loop to update the values of url parameters
    return (url, flag)

t0 = time.time()
database = {} #Create a dictionary

#Main Crawl function that calls all the above functions and crawls the entire site sequentially
def web_crawl():
    to_crawl = [starting_page] #Define list name 'Seed Page'
    #print(to_crawl)
    crawled = [] #Define list name 'Crawled'
    #database = {} #Create a dictionary
    #k = 0;
    for k in range(0, 3):
        i = 0 #Initiate variable to count no. of iterations
        while i < 3: #Continue looping till the 'to_crawl' list is not empty
            urll = to_crawl.pop(0) #If there are elements in to_crawl then pop out the first element
            urll, flag = url_parse(urll)
            #print(urll)
            flag2 = extension_scan(urll)
            time.sleep(3)
            #If flag = 1, then the URL is outside the seed domain URL
            if flag == 1 or flag2 == 1:
                pass #Do Nothing
            else:
                if urll in crawled: #Else check if the URL is already crawled
                    pass #Do Nothing
                else: #If the URL is not already crawled, then crawl it and extract all the links from it
                    print("Link = " + urll)
                    raw_html = download_page(urll)
                    #print(raw_html)
                    see_also, flag2 = extract_see_also(raw_html)
                    print("Related Links = " + see_also)
                    crawled.append(urll)
                    #Remove duplicates from to_crawl
                    n = 1
                    j = 0
                    #k = 0
                    while j < (len(to_crawl) - n):
                        if to_crawl[j] in to_crawl[j+1:(len(to_crawl)-1)]:
                            to_crawl.pop(j)
                            n = n + 1
                        else:
                            pass #Do Nothing
                        j = j + 1
            i = i + 1
    #print(to_crawl)
    #print("Iteration No. = " + str(i))
    #print("To Crawl = " + str(len(to_crawl)))
    #print("Crawled = " + str(len(crawled)))
    return ""

print(web_crawl())
t1 = time.time()
total_time = t1 - t0
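For the recursive part described at the top of this question, here is a minimal sketch (an illustration only, not the asker's draft) that pulls links out of a page's "See also" section with BeautifulSoup and recurses on each of them up to a fixed depth. The id="See_also" anchor and the assumption that the first <ul> after it holds the related links reflect Wikipedia's markup and may need adjusting:

import requests
from bs4 import BeautifulSoup

def see_also_links(url):
    # Fetch the page and locate the "See also" heading anchor (assumed id).
    html = BeautifulSoup(requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text, "html.parser")
    anchor = html.find(id="See_also")
    if anchor is None:
        return []
    # The first <ul> after the heading is assumed to hold the related links.
    ul = anchor.find_next("ul")
    if ul is None:
        return []
    return ["https://en.wikipedia.org" + a["href"]
            for a in ul.find_all("a", href=True) if a["href"].startswith("/wiki/")]

def crawl_see_also(url, depth=0, max_depth=1, seen=None):
    # Recursive traversal: print each page, then recurse into its "See also" links.
    seen = set() if seen is None else seen
    if url in seen or depth > max_depth:
        return
    seen.add(url)
    children = see_also_links(url)
    print("  " * depth + url + "  ->  " + str(len(children)) + " related links")
    for child in children:
        crawl_see_also(child, depth + 1, max_depth, seen)

crawl_see_also("https://en.wikipedia.org/wiki/Internet")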
