Python web-scraping script starts over during the loop

Below is the code. When I run the script, it starts over from the 1st page while running. The aim is to get the post title, date, and body from each page and then click Next at the bottom of each page to start the process again.
Here are the imports:
import requests
import csv
import urllib.parse as urlparse
from urllib.parse import parse_qs
from bs4 import BeautifulSoup
from selenium import webdriver
import time
browser = webdriver.Chrome('/Users/Xander/desktop/scraper/chromedriver')
URL = "https://www.jancox.com/jans-daily-news"
browser.get(URL)
URL_PAG = None
PAG = None
# Function Definition
def scrapeP(r):
    count = 0
    soup = BeautifulSoup(r.content, 'html5lib')  # if this line causes an error, run 'pip install html5lib'
    quotes = []
    table = soup.find('div', attrs={'class': 'main-content'})
    for post in table.findAll('div', attrs={'class': 'post'}):
        quote = {}
        quote['title'] = post.h1.text
        quote['date_published'] = post.time.text
        quote['post'] = post.div.text.strip()
        print(quote['date_published'])
        quotes.append(quote)
        doc = browser.find_elements_by_xpath('/html/body/div/div/div[2]/div/div[1]/div/div[2]/nav/div/ul/li[2]/a')[0]
        time.sleep(2)
        doc.click()
        URL_PAG = browser.current_url
        count += 1
        PAG = True
        time.sleep(10)
        print(count, ' - ', URL_PAG)
        if (count % 10 == 0):
            filename = 'post.csv'
            with open(filename, 'a+', newline='') as f:
                w = csv.DictWriter(f, ['title', 'post', 'date_published'])
                w.writeheader()
                for quote in quotes:
                    w.writerow(quote)
                quote.clear()

while True:
    if (PAG == True):
        browser.get(URL_PAG)
        r = requests.get(URL_PAG)
        print(URL_PAG)
        scrapeP(r)
    else:
        browser.get(URL)
        r = requests.get(URL)
        scrapeP(r)

The loop never gets past the first page: the get request that you see is the one fired at line 3 (the initial browser.get(URL)).
PAG is None and you change its value inside your scrapeP function, so you never change its value outside the scrapeP function's scope.
Declare
global PAG
inside your scrapeP function to change its value from within scrapeP:
>>> PAG = None
>>> def scrapeP():
...     global PAG
...     PAG = True
...
>>> scrapeP()
>>> PAG
True
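
For contrast, here is a minimal, self-contained sketch (not from the original answer; the URLs and the page-count check are made-up placeholders) of carrying the paging state through return values instead of globals, which avoids the scoping problem altogether:

def scrape_page(page_number):
    # Stand-in for "parse the page and look for a Next link".
    has_next = page_number < 3
    next_url = 'https://example.com/page/{}'.format(page_number + 1) if has_next else None
    return has_next, next_url

url = 'https://example.com/page/1'
page_number = 1
while True:
    print('scraping', url)
    has_next, next_url = scrape_page(page_number)
    if not has_next:
        break
    url = next_url
    page_number += 1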

The issue was that I had the click-next and print logic inside the for loop. I needed to allow the for loop to read the entire page before clicking next.
for post in table.findAll('div', attrs={'class': 'post'}):
    quote = {}
    quote['title'] = post.h1.text
    quote['date_published'] = post.time.text
    quote['post'] = post.div.text.strip()
    print(quote['date_published'])
    quotes.append(quote)
time.sleep(2)
doc = browser.find_elements_by_xpath('/html/body/div/div/div[2]/div/div[1]/div/div[2]/nav/div/ul/li[2]/a')[0]
doc.click()
time.sleep(2)
URL_PAG = browser.current_url
count += 1
PAG = True
time.sleep(2)
filename = 'post.csv'
with open(filename, 'a+', newline='') as f:
    w = csv.DictWriter(f, ['title', 'post', 'date_published'])
    w.writeheader()
    for quote in quotes:
        w.writerow(quote)
    quote.clear()
f.close()

Related

BeautifulSoup (Python): how to grab a text string next to a tag (that may or may not exist)?

I think my title explains the problem I am facing pretty well. Let's look at a picture of the problem. (You can find the web page at this address, however it has probably changed.)
I have highlighted the text that I want to grab in blue; this is the model year 2008. Now, it is not necessary for the seller to submit the model year, so this may or may not exist. But when it does exist it always follows the <i> tag with class="fa fa-calendar". My solution so far has been to grab all the text within <p class="result-details"> ... </p> (this then becomes a list) and then choose the second element, conditioned on <i class="fa fa-calendar"> ... </i> existing. Otherwise I do not grab anything.
Now, it seems this does not work in general, since the text that comes before the second element can be split into more than one element if it has whitespace in it. So, is there any way (any function) to grab a text string that neighbours another tag, as seen in my picture?
PS: if I have made myself unclear, I just want to fetch the year 2008 from the post on the web page if it exists.
Edit
In this situation my code erroneously gives me the word "Hjulvältar" (bulldozer in English) instead of the year 2008.
CODE
from bs4 import BeautifulSoup
from datetime import date
import requests

url_avvikande = ['bomliftar','teleskop-bomliftar','kompakta-sjalvgaende-bomlyftar','bandschaktare','reachstackers','staplare']
today = date.today().isoformat()
url_main = 'https://www.mascus.se'
produktgrupper = ['lantbruksmaskiner','transportfordon','skogsmaskiner','entreprenadmaskiner','materialhantering','gronytemaskiner']
kategorier = {
    'lantbruksmaskiner': ['traktorer','sjalvgaende-falthackar','skordetroskor','atv','utv:er','snoskotrar'],
    'transportfordon': ['fordonstruckar','elektriska-fordon','terrangfordon'],
    'skogsmaskiner': ['skog-skordare','skog-gravmaskiner','skotare','drivare','fallare-laggare','skogstraktorer','lunnare','terminal-lastare'],
    'entreprenadmaskiner': ['gravlastare','bandgravare','minigravare-7t','hjulgravare','midigravmaskiner-7t-12t','atervinningshanterare','amfibiska-gravmaskiner','gravmaskiner-med-frontskopa','gravmaskiner-med-lang-rackvidd','gravmaskiner-med-slapskopa','rivningsgravare','specialgravmaskiner','hjullastare','kompaktlastare','minilastmaskiner','bandlastare','teleskopiska-hjullastare','redaskapshallare','gruvlastare','truckar-och-lastare-for-gruvor','bergborriggar','teleskoplastare','dumprar','minidumprar','gruvtruckar','banddumprar','specialiserade-dragare','vaghyvlar','vattentankbilar','allterrangkranar','terrangkranar-grov-terrang','-bandgaende-kranar','saxliftar','bomliftar','teleskop-bomliftar','personhissar-och-andra-hissar','kompakta-sjalvgaende-bomlyftar','krossar','mobila-krossar','sorteringsverk','mobila-sorteringsverk','bandschaktare','asfaltslaggningsmaskiner','--asfaltskallfrasmaskiner','tvavalsvaltar','envalsvaltar','jordkompaktorer','pneumatiska-hjulvaltar','andra-valtar','kombirullar','borrutrustning-ytborrning','horisontella-borrutrustning','trenchers-skar-gravmaskin'],
    'materialhantering': ['dieseltruckar','eldrivna-gaffeltruckar','lpg-truckar','gaffeltruckar---ovriga','skjutstativtruck','sidlastare','teleskopbomtruckar','terminaltraktorer','reachstackers','ovriga-materialhantering-maskiner','staplare-led','staplare','plocktruck-laglyftande','plocktruck-hoglyftande','plocktruck-mediumlyftande','dragtruck','terrangtruck','4-vagstruck','smalgangstruck','skurborsttorkar','inomhus-sopmaskiner','kombinationsskurborstar'],
    'gronytemaskiner': ['kompakttraktorer','akgrasklippare','robotgrasklippare','nollsvangare','plattformsklippare','sopmaskiner','verktygsfraktare','redskapsbarare','golfbilar','fairway-grasklippare','green-grasklippare','grasmattevaltar','ovriga-gronytemaskiner']
}
url = 'https://www.mascus.se'
mappar = ['Lantbruk', 'Transportfordon', 'Skogsmaskiner', 'Entreprenad', 'Materialhantering', 'Grönytemaskiner']
index = -1
status = True

for produktgrupp in kategorier:
    index += 1
    mapp = mappar[index]
    save_path = f'/home/protector.local/vika99/webscrape_mascus/Annonser/{mapp}'
    underkategorier = kategorier[produktgrupp]
    for underkategori in underkategorier:
        # OBS
        if underkategori != 'borrutrustning-ytborrning' and status:
            continue
        else:
            status = False
        # OBS
        if underkategori in url_avvikande:
            url = f'{url_main}/{produktgrupp}/{underkategori}'
        elif underkategori == 'gravmaskiner-med-frontskopa':
            url = f'{url_main}/{produktgrupp}/begagnat-{underkategori}'
        elif underkategori == 'borrutrustning-ytborrning':
            url = f'{url_main}/{produktgrupp}/begagnad-{underkategori}'
        else:
            url = f'{url_main}/{produktgrupp}/begagnade-{underkategori}'
        file_name = f'{save_path}/{produktgrupp}_{underkategori}_{today}.txt'
        sida = 1
        print(url)
        with open(file_name, 'w') as f:
            while True:
                print(sida)
                html_text = None
                soup = None
                links = None
                while links == None:
                    html_text = requests.get(url).text
                    soup = BeautifulSoup(html_text, 'lxml')
                    links = soup.find('ul', class_='page-numbers')
                annonser = soup.find_all('li', class_='col-row single-result')
                for annons in annonser:
                    modell = annons.find('a', class_='title-font').text
                    if annons.p.find('i', class_='fa fa-calendar') != None:
                        tillverkningsar = annons.find('p', class_='result-details').text.strip().split(" ")[1]
                    else:
                        tillverkningsar = 'Ej angiven'
                    try:
                        pris = annons.find('span', class_='title-font no-ws-wrap').text
                    except AttributeError:
                        pris = annons.find('span', class_='title-font no-price').text
                    f.write(f'{produktgrupp:<21}{underkategori:25}{modell:<70}{tillverkningsar:<13}{pris:>14}\n')
                url_part = None
                sida += 1
                try:
                    url_part = links.find('a', text=f'{sida}')['href']
                except TypeError:
                    print(f'Avläsning av underkategori klar.')
                    break
                url = f'{url_main}{url_part}'
As you loop over the listings you can test whether the calendar icon class is present; if it is, grab the next_sibling:
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.mascus.se/entreprenadmaskiner/begagnade-pneumatiska-hjulvaltar')
soup = bs(r.content, 'lxml')
listings = soup.select('.single-result')

for listing in listings:
    calendar = listing.select_one('.fa-calendar')
    if calendar is not None:
        print(calendar.next_sibling)
    else:
        print('Not present')
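
Note that next_sibling here is the text node right after the icon (assuming, as in the answer above, that the year immediately follows it), so it may include surrounding whitespace. A small refinement of the loop (my addition, not part of the original answer) is to strip it and fall back to a default when the icon is missing:

for listing in listings:
    calendar = listing.select_one('.fa-calendar')
    year = calendar.next_sibling.strip() if calendar is not None else 'Ej angiven'
    print(year)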

how to scrape texts from votesmart via beautifulsoup

I am trying to scrape some statements made by U.S. politicians on votesmart.org.
I am experiencing errors in extracting the texts, though the code runs.
The code that I am using is as follows:
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import os

def main():
    df = pd.read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
    df = df[df.type == 'sen']
    df = df[~df.votesmart_id.isna()]
    done_list = os.listdir('corpus')
    print("{} senators".format(len(df)))
    df = df[~df.full_name.isin(done_list)]
    print("{} after some already done".format(len(df)))
    df = df.sample(frac=1)
    df.apply(scrape_politician_speeches, axis=1)

def scrape_politician_speeches(row):
    print('Scraping {}...'.format(row.full_name))
    vs_url = 'https://justfacts.votesmart.org/candidate/public-statements/{}'.format(int(row.votesmart_id))
    vs_page = requests.get(vs_url)  # fill in the last part of the url
    soup = BeautifulSoup(vs_page.content, features="lxml")
    n_pages = 1
    page_num = 1
    while page_num <= n_pages:
        print("\tPage {} of {}".format(page_num, n_pages))
        #speeches_url = vs_page.url + '?start=2019-01-01&speechType=14&p={}'.format(page_num)
        speeches_url = vs_page.url + '/?s=date&start=2020/01/01&end=&p={}'.format(page_num)
        speeches_page = requests.get(speeches_url)
        soup = BeautifulSoup(speeches_page.content, features="lxml")
        speech_table = soup.find('table', {'id': 'statementsObjectsTables'})
        speech_table = soup.find('tbody')
        speech_links = speech_table.find_all('a', href=True)
        speech_hrefs = [a.get('href') for a in speech_links]
        for href in speech_hrefs:
            scrape_speech(person=row.full_name, speech_url=href)
        try:
            n_pages = int(soup.find('h7').text.split()[-1])
        except:
            print("\tNo page numbers")
            pass
        page_num += 1
        sleep(1)

def scrape_speech(person, speech_url):
    try:
        if not os.path.isdir('corpus/{}'.format(person)):
            os.mkdir('corpus/{}'.format(person))
        speech_page = requests.get(speech_url)
        soup = BeautifulSoup(speech_page.content, features="lxml")
        title = soup.find('h3').text
        date = soup.find('span', {'itemprop': 'datePublished'}).text
        location = soup.find('span', {'itemprop': 'contentLocation'}).text
        body = soup.find('div', {'class': "main clear"})
        p_list = body.find_all('p')
        text_list = [p.text for p in p_list]
        speech_text = '\n\n'.join(text_list)
        full_text = '{}\n\n\n{}'.format(title, speech_text)
        file_name = '{}, {}, {}.txt'.format(title.split(',')[0], date, location)
        file_name = file_name.replace('/', ' ')
        with open('corpus/{}/{}'.format(person, file_name), 'w') as f:
            f.write(full_text)
    except:
        print("\tError with {}".format(speech_url))

if __name__ == '__main__':
    main()
The errors look like this:
95 senators
95 after some already done
Scraping Tammy Duckworth...
Page 1 of 1
Error with https://votesmart.org/public-statement/1570841/durbin-duckworth-announce-135-million-for-springfield-rail-improvement-project
Error with https://votesmart.org/public-statement/1570825/durbin-duckworth-statement-on-nomination-of-ladon-reynolds-to-serve-as-us-marshal-for-the-northern-district-of-illinois
Error with https://votesmart.org/public-statement/1570826/durbin-duckworth-announce-16-million-in-telehealth-funding-for-illinois-health-care-providers
Thank you so much for your time and attention. I hope to learn more from this wonderful community.
scrape_speech is outdated; the page design has probably changed since the script was written. There is no <div class="main clear"> in the HTML, there is no <span itemprop="datePublished">, and so on. You need to rewrite it using the current CSS selectors.
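
As a rough sketch only (the selectors below are illustrative placeholders, not verified against the current votesmart.org markup), the rewrite amounts to inspecting a statement page in the browser's dev tools, swapping in whatever selectors it uses today, and guarding each lookup so a missing element is reported instead of being swallowed by the bare except:

import requests
from bs4 import BeautifulSoup

def scrape_speech(person, speech_url):
    speech_page = requests.get(speech_url)
    soup = BeautifulSoup(speech_page.content, features="lxml")
    # '.statement-title' and '.statement-body p' are placeholder selectors;
    # replace them with whatever the current page actually uses.
    title_tag = soup.select_one('.statement-title')
    paragraphs = soup.select('.statement-body p')
    if title_tag is None or not paragraphs:
        print("\tSelectors did not match for {}".format(speech_url))
        return None
    title = title_tag.get_text(strip=True)
    speech_text = '\n\n'.join(p.get_text() for p in paragraphs)
    return '{}\n\n\n{}'.format(title, speech_text)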

How to add a condition in while True

Here is some code to scrape data from a website. I want to take this data from all pages, so I made a while True: loop. There are 550 pages, but I only want to scrape 10 or 20 of them. How do I put in a condition to pull only 10, 20, or 100 pages?
import requests
from bs4 import BeautifulSoup
import pandas as pd

re = requests.get("https://katmoviehd.sk/")
soup = BeautifulSoup(re.text, "html.parser")

while True:
    page = soup.find_all('h2')[1:]
    Category = soup.find_all('span', class_='meta-category')
    Category
    Category_list = []
    for i in Category:
        Category2 = i.text
        Category_list.append(Category2)
    link_list = []
    for i in page:
        link = (i.find("a")['href'])
        link_list.append(link)
    title_list = []
    for i in page:
        title = (i.find("a")['title'])
        title_list.append(title)
    Table = pd.DataFrame({'Links': link_list, 'Title': title_list, 'Category': Category_list})
    next_page = soup.find('a', class_='next page-numbers').get('href')
    next_page
    url = next_page
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
Add an if and use break:
while True:
    ....
    if time_to_quit:
        break
Or use a variable instead of True:
keep_going = True
while keep_going:
    ...
    keep_going = not am_i_done()  # or whatever fits
Or a page count:
pages = 0
while pages < 20:
    ...
    pages += 1
Well, you could have a simple counter variable outside your loop and, every time you successfully read a page, update the counter by 1.
Then, instead of a while loop with a True condition, you can do something like this:
counter = 0
TOTAL_PAGES = 20  # or 100, whatever you decide
...
while counter < TOTAL_PAGES:
    ...
    counter += 1
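
Putting that together with the question's code, a minimal sketch (reusing the question's selectors, which I have not re-verified against the site) of capping the crawl at a fixed number of pages might look like this:

import requests
from bs4 import BeautifulSoup

MAX_PAGES = 20                       # scrape at most this many pages
url = "https://katmoviehd.sk/"
pages_scraped = 0

while pages_scraped < MAX_PAGES:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # ... collect the links, titles and categories here, as in the question ...
    pages_scraped += 1
    next_link = soup.find('a', class_='next page-numbers')
    if next_link is None:            # no "next" link left: stop early
        break
    url = next_link.get('href')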

I am trying to get the backers and the home city of different projects on Kickstarter

With the following code I try to get the home city and the places where the backers are located from Kickstarter. However, I keep running into the following error:
File "D:/location", line 60, in <module>
    page1 = urllib.request.urlopen(projects[counter])
IndexError: list index out of range
Does someone have a more elegant solution to feed the page to urllib.request.urlopen? (see the lines in ** **)
code:
# coding: utf-8
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime
from collections import OrderedDict
import re

browser = webdriver.Firefox()
browser.get('https://www.kickstarter.com/discover?ref=nav')
categories = browser.find_elements_by_class_name('category-container')

category_links = []
for category_link in categories:
    # Each item in the list is a tuple of the category's name and its link.
    category_links.append((str(category_link.find_element_by_class_name('f3').text),
                           category_link.find_element_by_class_name('bg-white').get_attribute('href')))

scraped_data = []
now = datetime.now()
counter = 1

for category in category_links:
    browser.get(category[1])
    browser.find_element_by_class_name('sentence-open').click()
    time.sleep(2)
    browser.find_element_by_id('category_filter').click()
    time.sleep(2)
    for i in range(27):
        try:
            time.sleep(2)
            browser.find_element_by_id('category_' + str(i)).click()
            time.sleep(2)
        except:
            pass
    #while True:
    #    try:
    #        browser.find_element_by_class_name('load_more').click()
    #    except:
    #        break
    projects = []
    for project_link in browser.find_elements_by_class_name('clamp-3'):
        projects.append(project_link.find_element_by_tag_name('a').get_attribute('href'))
    for project in projects:
        **page1 = urllib.request.urlopen(projects[counter])**
        soup1 = BeautifulSoup(page1, "lxml")
        **page2 = urllib.request.urlopen(projects[counter].split('?')**[0]+'/community')
        soup2 = BeautifulSoup(page2, "lxml")
        time.sleep(2)
        print(str(counter)+': '+project+'\nStatus: Started.')
        project_dict = OrderedDict()
        project_dict['Category'] = category[0]
        browser.get(project)
        project_dict['Name'] = soup1.find(class_='type-24 type-28-sm type-38-md navy-700 medium mb3').text
        project_dict['Home State'] = str(soup1.find(class_='nowrap navy-700 flex items-center medium type-12').text)
        try:
            project_dict['Backer State'] = str(soup2.find(class_='location-list-wrapper js-location-list-wrapper').text)
        except:
            pass
        print('Status: Done.')
        counter += 1
        scraped_data.append(project_dict)

later = datetime.now()
diff = later - now
print('The scraping took '+str(round(diff.seconds/60.0,2))+' minutes, and scraped '+str(len(scraped_data))+' projects.')
df = pd.DataFrame(scraped_data)
df.to_csv('kickstarter-data.csv')
If you only use counter to print the project status message, you can use range or enumerate instead. Here is an example with enumerate:
for counter, project in enumerate(projects):
    ... code ...
enumerate produces a tuple (index, item), so the rest of your code should work fine as it is.
A few more things:
List indices start at 0, so when you use counter to access items you get an IndexError, because you initialize counter with 1.
In the for loop you don't need projects[counter]; just use project.
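
Applied to the inner loop in the question (a sketch only, assuming the imports and the projects list from the code above), it would look like this:

for counter, project in enumerate(projects):
    page1 = urllib.request.urlopen(project)
    soup1 = BeautifulSoup(page1, "lxml")
    page2 = urllib.request.urlopen(project.split('?')[0] + '/community')
    soup2 = BeautifulSoup(page2, "lxml")
    print(str(counter) + ': ' + project + '\nStatus: Started.')
    # ... build project_dict exactly as before ...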

PhantomJS not extracting links Selenium

I'm scraping a website using Selenium, Scrapy and PhantomJS. The problem with the code is that, although it scrolls the page perfectly, it extracts links only up to a certain limit. Beyond that it completely ignores the result of scrolling. When I use the Firefox WebDriver it works perfectly. Since I'm running the code on a server, I used PhantomJS and thus encountered the problem. Below is the code:
# -*- coding: utf-8 -*-
from scrapy.spider import BaseSpider
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
import re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait

class DukeSpider(BaseSpider):
    name = "dspider"
    allowed_domains = ["dukemedicine.org"]
    start_urls = ["http://www.dukemedicine.org/find-doctors-physicians"]  # hlor

    def __init__(self):
        self.driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
        self.driver.maximize_window()
        print 'here'

    def parse(self, response):
        print 'nowhere'
        print response
        print response.url
        b = open('doc_data_duke.csv', 'a')
        a = csv.writer(b, lineterminator='\n')
        print 'a'
        self.driver.get(response.url)
        time.sleep(10)
        wait = WebDriverWait(self.driver, 10)
        print 'helo'
        click = self.driver.find_element_by_xpath("//span[@id='specialty']")
        click.click()
        click_again = self.driver.find_element_by_xpath("//ul[@class='doctor-type']/li[@class='ng-binding ng-scope'][2]")
        click_again.click()
        time.sleep(25)
        act = ActionChains(self.driver)
        act.move_to_element(self.driver.find_element_by_id('doctor-matrix-section')).click()
        print 'now here'
        for i in range(0, 75):
            #self.driver.find_element_by_xpath("//div[@id='doctor-matrix-section']").send_keys(Keys.PAGE_DOWN)
            #self.driver.execute_script("window.scrollBy(0, document.body.scrollHeight);")
            #self.driver.find_element_by_tag_name("body").click()
            #self.driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)  # findElement(By.tagName("body")).sendKeys(Keys.UP);
            #self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
            #bg = self.driver.find_element_by_css_selector('body')
            #bg.send_keys(Keys.SPACE)
            act.send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(2)
            print i
            i += 1
        links = self.driver.find_elements_by_xpath("//div[@class = 'result-information']/div[@class='name']/a")
        for l in links:
            print l
            doc_list = l.get_attribute('href')
            if re.match(r'https:\/\/www\.dukemedicine\.org\/find-doctors-physicians\/#!\/(.*)', doc_list):
                print doc_list
                dr = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
                dr.maximize_window()
                dr.get(doc_list)
                try:
                    name_title = dr.find_element_by_xpath('//div[@class="header1 ng-binding"]').text
                    name_titles = name_title.split(",", 1)
                    name = name_titles[0].encode('utf-8')
                    title = name_titles[1]
                    print name.encode('utf-8')
                    title = title[1:].encode('utf-8')
                    print title.encode('utf-8')
                except:
                    name = ''
                    title = ''
                try:
                    speciality = dr.find_element_by_xpath('//p[@class="specialties ng-scope"]').text
                except:
                    speciality = ''
                try:
                    language = dr.find_element_by_xpath(
                        '//div[@class="lang ng-scope"]/div[@class="plainText inline ng-binding"]').text
                except:
                    language = ''
                if dr.find_elements_by_xpath('//div[@class="location-info"]'):
                    locations = dr.find_elements_by_xpath('//div[@class="location-info"]')
                    if len(locations) >= 3:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = locations[1].text.encode('utf-8')
                        locationB = locationB.replace('Directions', '')
                        locationB = locationB.replace('\n', '')
                        locationC = locations[2].text.encode('utf-8')
                        locationC = locationC.replace('\n', '')
                        locationC = locationC.replace('Directions', '')
                    elif len(locations) == 2:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = locations[1].text.encode('utf-8')
                        locationB = locationA.replace('Directions', '')
                        locationB = locationB.replace('\n', '')
                        locationC = ''
                    elif len(locations) == 1:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = ''
                        locationC = ''
                else:
                    locationA = ''
                    locationB = ''
                    locationC = ''
                dr.close()
                data = [title, name, speciality, language, locationA, locationB, locationC]
                print 'aaaa'
                print data
                a.writerow(data)
No matter what higher value I set in the range, it ignores results beyond a certain point.
Let's use the fact that there is an element holding the total result count.
The idea is to iteratively scroll the last found doctor into view until we've got all doctors loaded.
Implementation (with clarifying comments, leaving only the relevant Selenium-specific part):
# -*- coding: utf-8 -*-
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--load-images=false'])
# driver = webdriver.Chrome()
driver.maximize_window()
driver.get("http://www.dukemedicine.org/find-doctors-physicians")

# close optional survey popup if exists
try:
    driver.find_element_by_css_selector("area[alt=close]").click()
except NoSuchElementException:
    pass

# open up filter dropdown
click = driver.find_element_by_id("specialty")
click.click()

# choose specialist
specialist = driver.find_element_by_xpath("//ul[@class = 'doctor-type']/li[contains(., 'specialist')]")
specialist.click()

# artificial delay: TODO: fix?
time.sleep(15)

# read total results count
total_count = int(driver.find_element_by_id("doctor-number").text)

# get the initial results count
results = driver.find_elements_by_css_selector("div.doctor-result")
current_count = len(results)

# iterate while all of the results would not be loaded
while current_count < total_count:
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])
    results = driver.find_elements_by_css_selector("div.doctor-result")
    current_count = len(results)
    print "Current results count: %d" % current_count

# report total results
print "----"
print "Total results loaded: %d" % current_count
driver.quit()
Works for me perfectly in both PhantomJS and Chrome. Here is what I get on the console:
Current results count: 36
Current results count: 54
Current results count: 72
Current results count: 90
...
Current results count: 1656
Current results count: 1674
Current results count: 1692
Current results count: 1708
----
Total results loaded: 1708
Additionally, note that I've added the --load-images=false command-line argument, which speeds things up dramatically.
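On the "artificial delay: TODO: fix?" note in the code above, one way to avoid the fixed 15-second sleep (my sketch, not part of the original answer; it assumes the driver and the doctor-number element from the snippet above) is an explicit wait:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 30 seconds for the total-results element to appear and contain text,
# then read the count, instead of sleeping for a fixed amount of time
wait = WebDriverWait(driver, 30)
counter_element = wait.until(EC.presence_of_element_located((By.ID, "doctor-number")))
wait.until(lambda d: counter_element.text.strip() != "")
total_count = int(counter_element.text)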
