I need help with a little web scraper I have written and put into my pythonanywhere account to let it run several times per day.
here is my code:
import requests
from bs4 import BeautifulSoup
import time
import random
list_all_results = []
for i in range(1, 3):
time.sleep(random.uniform(1.5, 2))
print("Scraping page " + str(i) + "/745")
try:
URL = "https://www.futbin.com/players?page=" + str(i)
platform = "pc"
cookies = {"platform": platform}
page = requests.get(URL, cookies=cookies)
soup = BeautifulSoup(page.content, "html.parser")
result_names = soup.find_all("a", attrs={"class": "player_name_players_table"})
result_ratings = soup.find_all(
"span",
attrs={"class": lambda r: r.startswith("form rating ut21") if r else False},
)
result_rarity = soup.find_all("td", {"class": "mobile-hide-table-col"})
result_prices_pc = soup.find_all(
"span", attrs={"class": "pc_color font-weight-bold"}
)
list_names = []
list_ratings = []
list_rarities = []
list_prices = []
for name in result_names:
list_names.append(name.text)
for rating in result_ratings:
list_ratings.append(rating.text)
for rarity in result_rarity:
list_rarities.append(rarity.text)
for price in result_prices_pc:
n = price.text.strip()
if "K" in n:
n2 = n.replace("K", "")
full_int = int(float(n2) * 1000)
list_prices.append(full_int)
elif "M" in n:
n2 = n.replace("M", "")
full_int = int(float(n2) * 1000000)
list_prices.append(full_int)
else:
list_prices.append(int(price.text.strip()))
int_list_length = len(list_names)
for i in range(0, int_list_length):
list_all_results.append(
tuple(
(list_names[i], list_ratings[i], list_rarities[i], list_prices[i])
)
)
with open("/home/exec85/scrape/pc.txt", "a") as f: # create new .txt file and write content to file
f.write(f"{list_all_results}")
except:
pass
print("FINISHED")
For some reason I dont get any result printed, so I assume nothing gets scraped and also the .txt file is not created.
Even if I manually create the .txt file it gets not filled.
Running the script on my local machine all works fine.
Your code works well but if you want the code works on PythonAnywhere, you need to have a paid account but you can reach this site list.
Related
trying to build a webscraper to return lists of freelance gig postings on different websites into one place. My code is below and it keeps returning "None". I'm a bit stuck at this point, if you can help identify why it keeps doing this that would be great.
import requests
from bs4 import BeautifulSoup
import pprint
res1 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=python&badges=&sort_by=posted_desc') # this is where we will scrape the info from
soup1 = BeautifulSoup(res1.text, 'html.parser') # this tells BS to give us HTML code for the page
links1 = soup1.select('.new-task-list-item new-task-list-item--open') # link of each gig
subtext1 = soup1.select('.new-task-list-item__date at-icon-calendar') # date of each gig
res2 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=web%20developer&badges=&sort_by=posted_desc')
soup2 = BeautifulSoup(res2.text, 'html.parser')
links2 = soup2.select('.new-task-list-item new-task-list-item--open')
subtext2 = soup2.select('.new-task-list-item__date at-icon-calendar')
res3 = requests.get('https://www.upwork.com/freelance-jobs/website/')
soup3 = BeautifulSoup(res3.text, 'html.parser')
links3 = soup3.select('.job-title')
subtext3 = soup3.select('.text-muted')
res4 = requests.get('https://www.upwork.com/freelance-jobs/data-science/')
soup4 = BeautifulSoup(res4.text, 'html.parser')
links4 = soup4.select('.job-title')
subtext4 = soup4.select('.text-muted')
res5 = requests.get('https://www.upwork.com/freelance-jobs/bot-development/')
soup5 = BeautifulSoup(res5.text, 'html.parser')
links5 = soup5.select('.job-title')
subtext5 = soup5.select('.text-muted')
res6 = requests.get('https://www.upwork.com/freelance-jobs/python-script/')
soup6 = BeautifulSoup(res6.text, 'html.parser')
links6 = soup6.select('.job-title')
subtext6 = soup6.select('.text-muted')
mega_links = links1 + links2 + links3 + links4 + links5 + links6
mega_subtext = subtext1 + subtext2 + subtext3 + subtext4 + subtext5 + subtext6
def extract(links, subtexts):
joblist = []
for indx, item in enumerate(links):
title = item.getText()
href = item.get('href')
joblist.append({'title': title, 'link': href})
return joblist
pprint.pprint(extract(mega_links , mega_subtext))
I have no idea what exactly you are trying to extract from the scraped web page requests. Here's what I tried from my end:
Your links variable are null or empty lists since there is no such querySelector present for the web page you're trying to scrape. For example, the console of the first web page that you are scraping (the element you're trying to scrape doesn't exist):
I would recommend you to confirm the element you're trying to scrape and confirm it's class.
Another Point of Consideration:
When you will print your soup variables you will notice that you get CloudFare as the output.
class Crawler():
def __init__(self):
self.pag = 1
i = 0
def get_urls(self,main_url):
self.url = 'https://www.test.ro/search/'+ main_url +'/p1'
self.filename = main_url
r = requests.get(self.url)
soup = BeautifulSoup(r.text, 'html.parser')
number_pages = soup.find(class_= 'row' )
last_page = number_pages.find_all('a')[len(number_pages.find_all('a'))-2].get("data-page")
for i in range(1, int(last_page)+1):
url.append('https://www.test.ro/search/'+ main_url +'/p' + str(i))
def print_urls(self):
for urls in url:
print (urls)
def scrape(self,url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
product_list = soup.find(class_ = 'page-container')
product_list_name = product_list.find_all('h2')
product_list_oldprice = product_list.find_all(class_ = 'product-old-price')
product_list_newprice = product_list.find_all(class_ = 'product-new-price')
for i in range(0, len(product_list_name)):
name = product_list_name[i].get_text().strip()
link = product_list_name[i].find('a').get('href')
#print(name)
#print(len(name))
try:
price = product_list_oldprice[i].contents[0].get_text()
price = price[:-6]
#print(price)
except IndexError:
#print("no old price")
#print(product_list_newprice[i].contents[0])
with open(self.filename+'.csv', 'a', encoding = 'utf-8', newline='') as csv_file:
file_is_empty = os.stat(self.filename+'.csv').st_size == 0
fieldname = ['name','link', 'price_old', 'price_actualy']
writer = csv.DictWriter(csv_file, fieldnames = fieldname)
if file_is_empty:
writer.writeheader()
writer.writerow({'name':name,'link':link, 'price_old':price, 'price_actualy':product_list_newprice[i].contents[0]})
if __name__=='__main__':
print("Search for product: ")
urlsearch = input()
starttime = time.time()
scraper = Crawler()
scraper.get_urls(urlsearch)
scraper.print_urls()
#scraper.scrape(url[0])
pool = multiprocessing.Pool()
pool.map(scraper.scrape,url)
pool.close()
print('That took {} seconds'.format(time.time() - starttime))
So I have this scraper, it works perfectly on any website bag but only on the product page.
I did it for a specific website, but how could I go on each page to take the data from the product and give it back and do it all over again?
Is such a thing possible?
I now take the data from the products page, ie name, link, price.
You have divs there too.
Can I help href?
In this case you need to create a category scraper that safes all product urls first. Scrape all urls and go through all the category's and for example safe them to csv first (the product urls). Then you can take all the product urls from the CSV and loop through all of them.
I have a script that loops through multiple pages of a website and I want to skip over or add a blank space for the item that might not be on certain pages. For example, there are some pages that do not contain a license. When I run into one of those pages I get an attribute error. My script below loops through the first two pages with no problem, but when it hits the third page it stops. How can I fix this? Here is my script:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import json
base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 50
for i in range(4, n+1):
response = urlopen(base_url + "BookDetail.aspx?bookId=" + str(i))
page_html = response.read()
response.close()
#html parsing
page_soup = soup(page_html, "html.parser")
#grabs info for each textbook
containers = page_soup.findAll("div",{"class":"LongDescription"})
author = page_soup.select("p")
for container in containers:
item = {}
item['type'] = "Textbook"
item['title'] = container.find("div",{"class":"twothird"}).h1.text
item['author'] = author[3].get_text(separator=', ')
if item['author'] == " ":
item['author'] = "University of Minnesota Libraries Publishing"
item['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
item['source'] = "Open Textbook Library"
item['base_url'] = "https://open.umn.edu/opentextbooks/"
item['license'] = container.find("p",{"class":"Badge-Condition"}).a.text
if item['license'] != container.find("p",{"class":"Badge-Condition"}).a.text:
item['license'] = ""
item['license_url'] = container.find("p",{"class":"Badge-Condition"}).a["href"]
data.append(item) # add the item to the list
with open("./json/noSubject/otl-loop.json", "w") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)
I figured it out. My main issue was with item['license'] Here is my fix:
if container.find("p",{"class":"Badge-Condition"}).a:
item['license'] = container.find("p",{"class":"Badge-Condition"}).a.text
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license'] = ''
if container.find("p",{"class":"Badge-Condition"}).a:
item['license_url'] = container.find("p",{"class":"Badge-Condition"}).a["href"]
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license_url'] = ''
I am trying to make a sitemap for the following website:
http://aogweb.state.ak.us/WebLink/0/fol/12497/Row1.aspx
The code goes through and first determines how many pages are on the top directory level, then it stores the each page number and its corresponding link. Then it goes through each page and creates a dictionary that contains each 3 digit file value and the corresponding link for that value. From there the code takes creates another dictionary of the pages and links for each 3 digit directory (this is the point at which I am stuck). Once this is complete the goal is to create a dictionary that contains each 6 digit file number and its corresponding link.
However, the code randomly fails at certain points throughout the scraping process and gives the following error message:
Traceback (most recent call last):
File "C:\Scraping_Test.py", line 76, in <module>
totalPages = totalPages.text
AttributeError: 'NoneType' object has no attribute 'text'
Sometimes the code does not even run and automatically skips to the end of the program without any errors.
I am currently running python 3.6.0 and using all updated libraries on Visual Studio Community 2015. Any help will be appreciated as I am new to programming.
import bs4 as bs
import requests
import re
import time
def stop():
print('sleep 5 sec')
time.sleep(5)
url0 = 'http://aogweb.state.ak.us'
url1 = 'http://aogweb.state.ak.us/WebLink/'
r = requests.get('http://aogweb.state.ak.us/WebLink/0/fol/12497/Row1.aspx')
soup = bs.BeautifulSoup(r.content, 'lxml')
print('Status: ' + str(r.status_code))
stop()
pagesTopDic = {}
pagesTopDic['1'] = '/WebLink/0/fol/12497/Row1.aspx'
dig3Dic = {}
for link in soup.find_all('a'): #find top pages
if not link.get('title') is None:
if 'page' in link.get('title').lower():
page = link.get('title')
page = page.split(' ')[1]
#print(page)
pagesTopDic[page] = link.get('href')
listKeys = pagesTopDic.keys()
for page in listKeys: #on each page find urls for beggining 3 digits
url = url0 + pagesTopDic[page]
r = requests.get(url)
soup = bs.BeautifulSoup(r.content, 'lxml')
print('Status: ' + str(r.status_code))
stop()
for link in soup.find_all('a'):
if not link.get("aria-label") is None:
folder = link.get("aria-label")
folder = folder.split(' ')[0]
dig3Dic[folder] = link.get('href')
listKeys = dig3Dic.keys()
pages3Dic = {}
for entry in listKeys: #pages for each three digit num
print(entry)
url = url1 + dig3Dic[entry]
r = requests.get(url)
soup = bs.BeautifulSoup(r.content, 'lxml')
print('Status: ' + str(r.status_code))
stop()
tmpDic = {}
tmpDic['1'] = '/Weblink/' + dig3Dic[entry]
totalPages = soup.find('div',{"class": "PageXofY"})
print(totalPages)
totalPages = totalPages.text
print(totalPages)
totalPages = totalPages.split(' ')[3]
print(totalPages)
while len(tmpDic.keys()) < int(totalPages):
r = requests.get(url)
soup = bs.BeautifulSoup(r.content, 'lxml')
print('Status: ' + str(r.status_code))
stop()
for link in soup.find_all('a'): #find top pages
if not link.get('title') is None:
#print(link.get('title'))
if 'Page' in link.get('title'):
page = link.get('title')
page = page.split(' ')[1]
tmpDic[page] = link.get('href')
num = len(tmpDic.keys())
url = url0 + tmpDic[str(num)]
print()
pages3Dic[entry] = tmpDic
I am screen scraping data using a web crawler and storing the results - (tweets from a twitter page) as separate html files for each user I'm crawling. I intend to later parse the html files and store the data into a database for analysis. However, I am having a bizarre problem.
When I run the following program - a small snippet from the overall crawler - I am able to get a separate html file for each follower:
import re
import urllib2
import twitter
start_follower = "NYTimesKrugman"
depth = 3
searched = set()
api = twitter.Api()
def crawl(follower, in_depth):
if in_depth > 0:
searched.add(follower)
directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
output = open(directory, 'a')
output.write(follower)
output.write('\n\n')
users = api.GetFriends(follower)
names = set([str(u.screen_name) for u in users])
names -= searched
for name in list(names)[0:5]:
crawl(name, in_depth-1)
crawl(start_follower, depth)
for x in searched:
print x
print "Program is completed."
However, when I run the full crawler, I do not get a separate file for each follower:
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time
start_follower = "NYTimeskrugman"
depth = 2
searched = set()
api = twitter.Api()
def add_to_U(user):
U.append(user)
def site(follower): #creates a twitter site url in string format based on the follower username
followersite = "http://mobile.twitter.com/" + follower
return followersite
def getPage(follower): #obtains access to a webapge
url = site(follower)
response = urllib.urlopen(url)
return response
def getSoup(response): #creates the parsing module
html = response.read()
soup = BeautifulSoup(html)
return soup
def gettweets(soup, output):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
a = tag.renderContents()
b = str (a)
output.write(b)
output.write('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more') != -1:
return True
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def crawl(follower, in_depth): #main method of sorts
if in_depth > 0:
searched.add(follower)
directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
output = open(directory, 'a')
output.write(follower)
output.write('\n\n')
a = getPage(follower)
soup = getSoup(a)
gettweets(soup, output)
tweets = are_more_tweets(soup)
while(tweets):
b = getnewlink(soup)
red = urllib.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup, output)
tweets = are_more_tweets(soup)
users = api.GetFriends(follower)
names = set([str(u.screen_name) for u in users])
names -= searched
for name in list(names)[0:5]:
print name
crawl(name, in_depth - 1)
crawl(start_follower, depth)
print("Program done. Look at output file.")
More specifically, I seem to get a separate html file for about the first five followers and then no new files appear to be created. Any help would be appreciated!
The depth value is different between the snippet and the full code (you're only going to get one level of recursion in the full code). Also, you only grab the first five names from the followers list: for name in list(names)[0:5]: So you get six people total: the starting follower and their first five friends.