Python Web Scraping Randomly Failing

I am trying to make a sitemap for the following website:
http://aogweb.state.ak.us/WebLink/0/fol/12497/Row1.aspx
The code first determines how many pages are on the top directory level, then stores each page number and its corresponding link. It then goes through each page and builds a dictionary that maps each 3-digit file value to the corresponding link for that value. From there the code builds another dictionary of the pages and links for each 3-digit directory (this is the point at which I am stuck). Once this is complete, the goal is to create a dictionary that contains each 6-digit file number and its corresponding link.
However, the code randomly fails at certain points throughout the scraping process and gives the following error message:
Traceback (most recent call last):
  File "C:\Scraping_Test.py", line 76, in <module>
    totalPages = totalPages.text
AttributeError: 'NoneType' object has no attribute 'text'
Sometimes the code does not even get that far and simply runs to the end of the program without any errors.
I am currently running Python 3.6.0 with up-to-date libraries in Visual Studio Community 2015. Any help will be appreciated, as I am new to programming.
import bs4 as bs
import requests
import re
import time

def stop():
    print('sleep 5 sec')
    time.sleep(5)

url0 = 'http://aogweb.state.ak.us'
url1 = 'http://aogweb.state.ak.us/WebLink/'

r = requests.get('http://aogweb.state.ak.us/WebLink/0/fol/12497/Row1.aspx')
soup = bs.BeautifulSoup(r.content, 'lxml')
print('Status: ' + str(r.status_code))
stop()

pagesTopDic = {}
pagesTopDic['1'] = '/WebLink/0/fol/12497/Row1.aspx'
dig3Dic = {}

for link in soup.find_all('a'):  # find top pages
    if not link.get('title') is None:
        if 'page' in link.get('title').lower():
            page = link.get('title')
            page = page.split(' ')[1]
            #print(page)
            pagesTopDic[page] = link.get('href')

listKeys = pagesTopDic.keys()

for page in listKeys:  # on each page find urls for beginning 3 digits
    url = url0 + pagesTopDic[page]
    r = requests.get(url)
    soup = bs.BeautifulSoup(r.content, 'lxml')
    print('Status: ' + str(r.status_code))
    stop()

    for link in soup.find_all('a'):
        if not link.get("aria-label") is None:
            folder = link.get("aria-label")
            folder = folder.split(' ')[0]
            dig3Dic[folder] = link.get('href')

listKeys = dig3Dic.keys()
pages3Dic = {}

for entry in listKeys:  # pages for each three digit num
    print(entry)
    url = url1 + dig3Dic[entry]
    r = requests.get(url)
    soup = bs.BeautifulSoup(r.content, 'lxml')
    print('Status: ' + str(r.status_code))
    stop()

    tmpDic = {}
    tmpDic['1'] = '/Weblink/' + dig3Dic[entry]

    totalPages = soup.find('div', {"class": "PageXofY"})
    print(totalPages)
    totalPages = totalPages.text
    print(totalPages)
    totalPages = totalPages.split(' ')[3]
    print(totalPages)

    while len(tmpDic.keys()) < int(totalPages):
        r = requests.get(url)
        soup = bs.BeautifulSoup(r.content, 'lxml')
        print('Status: ' + str(r.status_code))
        stop()

        for link in soup.find_all('a'):  # find top pages
            if not link.get('title') is None:
                #print(link.get('title'))
                if 'Page' in link.get('title'):
                    page = link.get('title')
                    page = page.split(' ')[1]
                    tmpDic[page] = link.get('href')

        num = len(tmpDic.keys())
        url = url0 + tmpDic[str(num)]
        print()

    pages3Dic[entry] = tmpDic
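One way to make the scrape more tolerant of the intermittent failures is to re-request the page whenever soup.find() comes back empty, instead of calling .text on a possible None. This is only a minimal sketch under the assumption that the server sometimes returns a page without the PageXofY element; the retry_find helper name is hypothetical:

import time
import bs4 as bs
import requests

def retry_find(url, selector_args, attempts=3, delay=5):
    # Re-request the page until soup.find(...) actually matches something,
    # rather than assuming the element is always present.
    for _ in range(attempts):
        r = requests.get(url)
        soup = bs.BeautifulSoup(r.content, 'lxml')
        element = soup.find(*selector_args)
        if element is not None:
            return soup, element
        time.sleep(delay)
    raise RuntimeError('element not found after %d attempts: %s' % (attempts, url))

# usage inside the three-digit loop, replacing the bare soup.find(...):
# soup, totalPages = retry_find(url, ('div', {"class": "PageXofY"}))
# totalPages = totalPages.text.split(' ')[3]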

Related

Python / BeautifulSoup webscraper returning "None"

I'm trying to build a web scraper that collects freelance gig postings from different websites into one place. My code is below and it keeps returning "None". I'm a bit stuck at this point; if you can help identify why it keeps doing this, that would be great.
import requests
from bs4 import BeautifulSoup
import pprint
res1 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=python&badges=&sort_by=posted_desc') # this is where we will scrape the info from
soup1 = BeautifulSoup(res1.text, 'html.parser') # this tells BS to give us HTML code for the page
links1 = soup1.select('.new-task-list-item new-task-list-item--open') # link of each gig
subtext1 = soup1.select('.new-task-list-item__date at-icon-calendar') # date of each gig
res2 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=web%20developer&badges=&sort_by=posted_desc')
soup2 = BeautifulSoup(res2.text, 'html.parser')
links2 = soup2.select('.new-task-list-item new-task-list-item--open')
subtext2 = soup2.select('.new-task-list-item__date at-icon-calendar')
res3 = requests.get('https://www.upwork.com/freelance-jobs/website/')
soup3 = BeautifulSoup(res3.text, 'html.parser')
links3 = soup3.select('.job-title')
subtext3 = soup3.select('.text-muted')
res4 = requests.get('https://www.upwork.com/freelance-jobs/data-science/')
soup4 = BeautifulSoup(res4.text, 'html.parser')
links4 = soup4.select('.job-title')
subtext4 = soup4.select('.text-muted')
res5 = requests.get('https://www.upwork.com/freelance-jobs/bot-development/')
soup5 = BeautifulSoup(res5.text, 'html.parser')
links5 = soup5.select('.job-title')
subtext5 = soup5.select('.text-muted')
res6 = requests.get('https://www.upwork.com/freelance-jobs/python-script/')
soup6 = BeautifulSoup(res6.text, 'html.parser')
links6 = soup6.select('.job-title')
subtext6 = soup6.select('.text-muted')
mega_links = links1 + links2 + links3 + links4 + links5 + links6
mega_subtext = subtext1 + subtext2 + subtext3 + subtext4 + subtext5 + subtext6
def extract(links, subtexts):
    joblist = []
    for indx, item in enumerate(links):
        title = item.getText()
        href = item.get('href')
        joblist.append({'title': title, 'link': href})
    return joblist

pprint.pprint(extract(mega_links, mega_subtext))
I'm not sure what exactly you are trying to extract from the scraped web pages. Here's what I found when I tried it from my end:
Your links variables are null (empty lists) because no element on the pages you're scraping matches that selector. Inspecting the first page you are scraping in the browser console shows that the element you're targeting doesn't exist.
I would recommend confirming the element you're trying to scrape and its class.
Another point of consideration: when you print your soup variables, you will notice that the response comes from Cloudflare.
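For what it's worth, the space in '.new-task-list-item new-task-list-item--open' makes it a descendant selector (an element with class new-task-list-item--open inside one with class new-task-list-item). If you mean a single element carrying both classes, chain the class selectors without the space, and check that the selector matched anything before relying on it. A small sketch of the idea (the real listing markup may differ, or the page may be served by Cloudflare):

import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.airtasker.com/tasks/?search_term=python')
soup = BeautifulSoup(res.text, 'html.parser')

# both classes on the same element: no space between the class selectors
links = soup.select('.new-task-list-item.new-task-list-item--open')

if not links:
    # an empty list means the selector matched nothing; inspect what came back
    print('no matches - page title was:', soup.title.string if soup.title else None)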

web scraping in pythonanywhere not working

I need help with a little web scraper I have written and put into my PythonAnywhere account so it can run several times per day.
Here is my code:
import requests
from bs4 import BeautifulSoup
import time
import random

list_all_results = []

for i in range(1, 3):
    time.sleep(random.uniform(1.5, 2))
    print("Scraping page " + str(i) + "/745")
    try:
        URL = "https://www.futbin.com/players?page=" + str(i)
        platform = "pc"
        cookies = {"platform": platform}
        page = requests.get(URL, cookies=cookies)
        soup = BeautifulSoup(page.content, "html.parser")

        result_names = soup.find_all("a", attrs={"class": "player_name_players_table"})
        result_ratings = soup.find_all(
            "span",
            attrs={"class": lambda r: r.startswith("form rating ut21") if r else False},
        )
        result_rarity = soup.find_all("td", {"class": "mobile-hide-table-col"})
        result_prices_pc = soup.find_all(
            "span", attrs={"class": "pc_color font-weight-bold"}
        )

        list_names = []
        list_ratings = []
        list_rarities = []
        list_prices = []

        for name in result_names:
            list_names.append(name.text)
        for rating in result_ratings:
            list_ratings.append(rating.text)
        for rarity in result_rarity:
            list_rarities.append(rarity.text)
        for price in result_prices_pc:
            n = price.text.strip()
            if "K" in n:
                n2 = n.replace("K", "")
                full_int = int(float(n2) * 1000)
                list_prices.append(full_int)
            elif "M" in n:
                n2 = n.replace("M", "")
                full_int = int(float(n2) * 1000000)
                list_prices.append(full_int)
            else:
                list_prices.append(int(price.text.strip()))

        int_list_length = len(list_names)
        for i in range(0, int_list_length):
            list_all_results.append(
                tuple(
                    (list_names[i], list_ratings[i], list_rarities[i], list_prices[i])
                )
            )

        with open("/home/exec85/scrape/pc.txt", "a") as f:  # create new .txt file and write content to file
            f.write(f"{list_all_results}")
    except:
        pass

print("FINISHED")
For some reason I don't get any result printed, so I assume nothing gets scraped, and the .txt file is not created either. Even if I manually create the .txt file, it does not get filled.
Running the script on my local machine, everything works fine.
Your code works fine, but to make it work on PythonAnywhere you need a paid account: free accounts can only make outbound requests to sites on PythonAnywhere's whitelist, which you can check your target site against.
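Either way, the bare except: pass is hiding the real error. On a free PythonAnywhere account, a request to a site that is not whitelisted typically raises a proxy/connection error from requests, and printing it makes the cause obvious. A minimal sketch of the idea, not the full scraper:

import requests

try:
    page = requests.get("https://www.futbin.com/players?page=1",
                        cookies={"platform": "pc"}, timeout=10)
    page.raise_for_status()
except requests.exceptions.RequestException as exc:
    # Instead of silently passing, show why the request failed
    # (e.g. a ProxyError on PythonAnywhere for non-whitelisted sites).
    print("request failed:", exc)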

How do I only print the links that contain certain text in the description on that page?

I'm trying to open the links whose pages contain certain words. If a word such as "engineering" is present on the linked page, the link should be returned; if not, it should be skipped.
Here is what I have so far. The inputs I used were "engineering" for the job and "north york" for the location.
import requests
from bs4 import BeautifulSoup
import webbrowser
import time

jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'

r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")

filter_words = ['chemical engineering', 'instrumentation', 'QA']
all_job_url = []
filtered_job_links = []
http_flinks = []
flinks = []

def get_all_joblinks():  # obtains all the links on the search page
    for tag in prettify.find_all('a', {'data-tn-element': "jobTitle"}):
        link = tag['href']
        all_job_url.append(link)

def filter_links():
    for eachurl in all_job_url:  # iterates through each link
        rurl = requests.get(base_url + eachurl)
        content = rurl.content
        soup = BeautifulSoup(content, "html.parser")
        summary = soup.get_text()
        # supposed to filter links based on certain words within text on link page
        if any(word in summary for word in filter_words):
            for filtered_link in soup.find_all('link', {'rel': 'canonical'}):
                flink = filtered_link['href']  # obtains only filtered links
                if "http:" in flink:
                    http_flinks.append(flink)
                    print(http_flinks)
                else:
                    flinks.append(flink)
                    #website = webbrowser.open_new(base_url + flink)
                    time.sleep(3)
            print(flinks)
        else:
            print("nothing")
            pass

def search_job():
    while True:
        if prettify.select('div.no_results'):
            print("no job matches found")
            break
        else:
            # opens the web page of job search if entries are found
            website = webbrowser.open_new(url)
            break

get_all_joblinks()
filter_links()
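As a side note: the hardcoded filter_words list does not include the term typed at the prompt, the in check is case-sensitive, and base_url + eachurl can produce malformed URLs. A minimal sketch of one way to address those points, reusing the jobsearch input from the question (not a tested drop-in fix):

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def page_contains(page_url, words):
    # Fetch a page and report whether any keyword appears in its visible text
    # (lower-cased on both sides, so "Engineering" also matches "engineering").
    soup = BeautifulSoup(requests.get(page_url).content, "html.parser")
    text = soup.get_text().lower()
    return any(word.lower() in text for word in words)

# hypothetical usage with the variables from the question:
# keywords = [jobsearch]                      # e.g. ["engineering"]
# for eachurl in all_job_url:
#     full_url = urljoin(base_url, eachurl)   # safer than base_url + eachurl
#     if page_contains(full_url, keywords):
#         filtered_job_links.append(full_url)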

Python BeautifulSoup webcrawling: Formatting output

The site I am trying to crawl is http://www.boxofficemojo.com/yearly/chart/?yr=2013&p=.htm. The specific page I'm focusing on now is http://www.boxofficemojo.com/movies/?id=catchingfire.htm.
From this page, I am having trouble with two things. The first thing is the "Foreign Gross" amount (under Total lifetime Grosses). I got the amount with this function:
def getForeign(item_url):
    response = requests.get(item_url)
    soup = BeautifulSoup(response.content)
    print soup.find(text="Foreign:").find_parent("td").find_next_sibling("td").get_text(strip = True)
The problem is, I can print this amount to the console, but I can't append these values to a list or write them to a CSV file. For the previous data I needed from this site, I got each piece of information for each movie and appended it to one list, which I then exported to the CSV file.
How can I get this "Foreign Gross" amount as a separate value for each movie? What do I need to change?
The second problem is related to getting a list of the actors/actresses for each movie. I have this function:
def getActors(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    tempActors = []
    print soup.find(text="Actors:").find_parent("tr").text[7:]
This prints out the actors run together, like so: Jennifer LawrenceJosh HutchersonLiam HemsworthElizabeth BanksStanley TucciWoody HarrelsonPhilip Seymour HoffmanJeffrey WrightJena MaloneAmanda PlummerSam ClaflinDonald SutherlandLenny Kravitz
I am also having the same problem as with the foreign gross amount. I want to get each individual actor separately, append them all to a temporary list, and then later append that list to another full list covering all the movies. I did this with the list of directors, but since all the directors are links while not all of the actors/actresses have HTML links, I can't do the same. Another issue right now is that there is no space between the actors' names.
Why are my current functions not working, and how can I fix them?
More code:
def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.boxofficemojo.com/yearly/chart/?page=' + str(page) + '&view=releasedate&view2=domestic&yr=2013&p=.htm'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.select('td > b > font > a[href^=/movies/?]'):
            href = 'http://www.boxofficemojo.com' + link.get('href')
            details(href)
            listOfForeign.append(getForeign(href))
            listOfDirectors.append(getDirectors(href))
            str(listOfDirectors).replace('[','').replace(']','')
            getActors(href)
            title = link.string
            listOfTitles.append(title)
        page += 1
listOfForeign = []

def getForeign(item_url):
    s = urlopen(item_url).read()
    soup = BeautifulSoup(s)
    return soup.find(text="Foreign:").find_parent("td").find_next_sibling("td").get_text(strip = True)

def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.boxofficemojo.com/yearly/chart/?page=' + str(page) + '&view=releasedate&view2=domestic&yr=2013&p=.htm'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.select('td > b > font > a[href^=/movies/?]'):
            href = 'http://www.boxofficemojo.com' + link.get('href')
            listOfForeign.append(getForeign(href))
        page += 1

print listOfForeign
returns
Traceback (most recent call last):
  File "C:/Users/younjin/PycharmProjects/untitled/movies.py", line 75, in <module>
    spider(1)
  File "C:/Users/younjin/PycharmProjects/untitled/movies.py", line 29, in spider
    listOfForeign.append(getForeign(href))
  File "C:/Users/younjin/PycharmProjects/untitled/movies.py", line 73, in getForeign
    return soup.find(text="Foreign:").find_parent("td").find_next_sibling("td").get_text(strip = True)
AttributeError: 'NoneType' object has no attribute 'find_parent'
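For the record, the usual shape of a fix for both problems is to return values instead of printing them, and to guard against pages where the label row is missing (which is what raises the AttributeError above). A minimal sketch, not tested against Box Office Mojo's exact markup; the stripped_strings approach assumes each actor name sits in its own tag, and listOfActors below is a hypothetical list mirroring listOfForeign:

def getForeign(item_url):
    soup = BeautifulSoup(requests.get(item_url).content)
    label = soup.find(text="Foreign:")
    if label is None:  # some movie pages have no foreign gross row
        return None
    return label.find_parent("td").find_next_sibling("td").get_text(strip=True)

def getActors(item_url):
    soup = BeautifulSoup(requests.get(item_url).content)
    label = soup.find(text="Actors:")
    if label is None:
        return []
    row = label.find_parent("tr")
    # stripped_strings yields each tagged piece of text separately;
    # drop the leading "Actors:" cell so only the names remain
    return list(row.stripped_strings)[1:]

# in spider():
#     listOfForeign.append(getForeign(href))   # now stores a value (or None)
#     listOfActors.append(getActors(href))     # stores a list of names per movie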

File Storage Problem with Python Web Crawler

I am screen scraping data using a web crawler and storing the results (tweets from a Twitter page) as separate HTML files for each user I'm crawling. I intend to later parse the HTML files and store the data in a database for analysis. However, I am having a bizarre problem.
When I run the following program - a small snippet from the overall crawler - I am able to get a separate html file for each follower:
import re
import urllib2
import twitter

start_follower = "NYTimesKrugman"
depth = 3
searched = set()

api = twitter.Api()

def crawl(follower, in_depth):
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            crawl(name, in_depth-1)

crawl(start_follower, depth)

for x in searched:
    print x
print "Program is completed."
However, when I run the full crawler, I do not get a separate file for each follower:
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time

start_follower = "NYTimeskrugman"
depth = 2
searched = set()

api = twitter.Api()

def add_to_U(user):
    U.append(user)

def site(follower):  # creates a twitter site url in string format based on the follower username
    followersite = "http://mobile.twitter.com/" + follower
    return followersite

def getPage(follower):  # obtains access to a webpage
    url = site(follower)
    response = urllib.urlopen(url)
    return response

def getSoup(response):  # creates the parsing module
    html = response.read()
    soup = BeautifulSoup(html)
    return soup

def gettweets(soup, output):
    tags = soup.findAll('div', {'class' : "list-tweet"})  # to obtain tweet of a follower
    for tag in tags:
        a = tag.renderContents()
        b = str(a)
        output.write(b)
        output.write('\n\n')

def are_more_tweets(soup):  # to check whether there is more than one page on mobile twitter
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        test_b = str(b)
        if test_b.find('more') != -1:
            return True
    return False

def getnewlink(soup):  # to get the link to go to the next page of tweets on twitter
    links = soup.findAll('a', {'href': True}, {id : 'more_link'})
    for link in links:
        b = link.renderContents()
        if str(b) == 'more':
            c = link['href']
            d = 'http://mobile.twitter.com' + c
            return d

def crawl(follower, in_depth):  # main method of sorts
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        a = getPage(follower)
        soup = getSoup(a)
        gettweets(soup, output)
        tweets = are_more_tweets(soup)
        while(tweets):
            b = getnewlink(soup)
            red = urllib.urlopen(b)
            html = red.read()
            soup = BeautifulSoup(html)
            gettweets(soup, output)
            tweets = are_more_tweets(soup)
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            print name
            crawl(name, in_depth - 1)

crawl(start_follower, depth)
print("Program done. Look at output file.")
More specifically, I seem to get a separate html file for about the first five followers and then no new files appear to be created. Any help would be appreciated!
The depth value is different between the snippet and the full code (you're only going to get one level of recursion in the full code). Also, you only grab the first five names from the friends list (for name in list(names)[0:5]:), so you get six people total: the starting follower and their first five friends.
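To make the arithmetic concrete, here is a rough upper bound on how many files each configuration can produce (a sketch that ignores the searched-set deduplication, so real counts can only be lower):

def max_files(depth, fanout=5):
    # crawl() writes one file per call while in_depth > 0, and each call
    # recurses into at most `fanout` friends, so the file count is bounded
    # by a geometric series: 1 + fanout + fanout**2 + ... (depth terms).
    return sum(fanout ** level for level in range(depth))

print(max_files(2))  # 6  -> what the full crawler with depth = 2 allows
print(max_files(3))  # 31 -> what the snippet with depth = 3 allows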
