How do you scrape a web page with infinite scrolling?
My first try was using Selenium, but it detects as robot:
from selenium import webdriver
import time
import pandas as pd
url = 'https://www.bloomberg.com/search?query=indonesia%20mining'
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=options)
driver.get(url)
html = driver.page_source.encode('utf-8')
page_num = 0
while driver.find_elements_by_css_selector('.contentWell__a8d28605a5'):
driver.find_element_by_css_selector('.contentWell__a8d28605a5').click()
page_num += 1
print("getting page number "+str(page_num))
time.sleep(1)
html = driver.page_source.encode('utf-8')
soup = BeautifulSoup(html, 'lxml')
titles = soup.find_all('div', {"class":"text__d88756958e withThumbnail__c4ffc902a6"})
df = pd.DataFrame(columns=['judul', 'link'])
news = {}
for t in titles:
news['judul'] = t.find('a', {'class':'headline__96ba1917df'}).text.strip()
news['link'] = t.find('a', {'class':'headline__96ba1917df'}).get('href')
df = df.append(news, ignore_index=True)
any idea how to limit the maximum page number?
Related
I am trying to scrape basic information on google. The code that I am using is the following. Unfortunately it does not move to the next page and I am not figuring the reason why. I am using selenium and google chrome as browser (no firefox). Could you please tell me what is wrong in my code?
driver.get('https://www.google.com/advanced_search?q=google&tbs=cdr:1,cd_min:3/4/2020,cd_max:3/4/2020&hl=en')
search = driver.find_element_by_name('q')
search.send_keys('tea')
search.submit()
soup = BeautifulSoup(driver.page_source,'lxml')
result_div = soup.find_all('div', attrs={'class': 'g'})
titles = []
while True:
next_page_btn =driver.find_elements_by_xpath("//a[#id='pnnext']")
for r in result_div:
if len(next_page_btn) <1:
print("no more pages left")
break
else:
try:
title = None
title = r.find('h3')
if isinstance(title,Tag):
title = title.get_text()
print(title)
if title != '' :
titles.append(title)
except:
continue
element =WebDriverWait(driver,5).until(expected_conditions.element_to_be_clickable((By.ID,'pnnext')))
driver.execute_script("return arguments[0].scrollIntoView();", element)
element.click()
I set q in the query string to be an empty string. Used as_q not q for the search box name. And reordered your code a bit. I put a page limit in to stop it going on forever.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
driver = webdriver.Chrome()
driver.get('https://www.google.com/advanced_search?q=&tbs=cdr:1,cd_min:3/4/2020,cd_max:3/4/2020&hl=en')
search = driver.find_element_by_name('as_q')
search.send_keys('tea')
search.submit()
titles = []
page_limit = 5
page = 0
while True:
soup = BeautifulSoup(driver.page_source, 'lxml')
result_div = soup.find_all('div', attrs={'class': 'g'})
for r in result_div:
for title in r.find_all('h3'):
title = title.get_text()
print(title)
titles.append(title)
next_page_btn = driver.find_elements_by_id('pnnext')
if len(next_page_btn) == 0 or page > page_limit:
break
element = WebDriverWait(driver, 5).until(expected_conditions.element_to_be_clickable((By.ID, 'pnnext')))
driver.execute_script("return arguments[0].scrollIntoView();", element)
element.click()
page = page + 1
driver.quit()
I am trying to start a new thread for each page, but this way it starts a new thread after the other thread/function is finished.
Can anyone help me run them independent of each other?
Example:
Thread 1:
Open page 1
Thread 2:
Open page 2
And do this for X amount of pages.
I am a beginner in python so excuse my messy code.
import random
import string
import threading
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
# driver.find_element_by_css_selector("a[onclick*='if (!window.__cfRLUnblockHandlers) return false; bail()']")
def randomStringDigits(stringLength=6):
"""Generate a random string of letters and digits """
lettersAndDigits = string.ascii_letters + string.digits
return ''.join(random.choice(lettersAndDigits) for i in range(stringLength))
def startscrape(url):
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get("urlhere")
cookies_list = driver.get_cookies()
cookies_dict = {} # create dictionary
usrelem = driver.find_element_by_name("login")
usrelem.send_keys("user")
pwdelem = driver.find_element_by_name("password")
pwdelem.send_keys("pass")
pwdelem.send_keys(Keys.RETURN)
sleep(1)
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
xx = soup.find("input",
{"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
driver.get(page)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
xxx = soup.findAll("a", {"class": "js-lbImage"})
# find all thumbs
for link in xxx:
xxx = soup.find("a", {"href": link.get('href')})
dlfullimg = driver.find_element_by_xpath("//a[#href='" + xxx.get('href') + "']")
wait = WebDriverWait(driver, 10)
dlfullimg.click()
thumbs = soup.findAll("div", {"class": "lg-thumb-item"})
dlfullimg = driver.find_element_by_id('lg-download').click()
close = driver.find_element_by_xpath("//span[#class='lg-close lg-icon']").click()
sleep(1)
assert "No results found." not in driver.page_source
url = input("Main URL: ")
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get("urlhere")
cookies_list = driver.get_cookies()
cookies_dict = {} # create dictionary
usrelem = driver.find_element_by_name("login")
usrelem.send_keys("user")
pwdelem = driver.find_element_by_name("password")
pwdelem.send_keys("pass")
pwdelem.send_keys(Keys.RETURN)
sleep(1)
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Find page number with soup.find
xx = soup.find("input",
{"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
driver.close()
threads = []
for i in range(int(xx.get('max'))):
page = url + "page-" + str(i + 1)
t = threading.Thread(target=startscrape(url), args=[])
threads.append(t)
for t in threads:
t.start()
for t in threads:
t.join()
You can use concurrent.futures to handle the heavy lifting for you
Here's a pseudo-code to do it
import concurrent.futures
from selenium import webdriver
def process_url(url):
driver = webdriver.Chrome()
driver.get(url)
# process page
driver.close
# Find number of pages here
driver = webdriver.Chrome()
driver.get(url)
# urls = find list of urls
driver.close
threads_count = 10
with concurrent.futures.ThreadPoolExecutor(threads_count) as executor:
executor.map(process_url, urls)
I am triyng to make my small webscraping script on this russian website. This script have to find all available doctors.
Had some issues to get hidden class inside "Запись" tab.
After making some research I've found Selenium module and here is my code:
url = 'https://samozapis-spb.ru/moskovskiy-rayon/ctomatologicheskaya-poliklinika-no12'
from time import sleep
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
opts = Options()
prefs = {"profile.managed_default_content_settings.images": 2}
opts.add_experimental_option("prefs", prefs)
browser = Chrome(options=opts)
browser.get(url)
elems = browser.find_elements_by_css_selector('tr')
print('running chrome')
while True:
#print(browser.session_id)
elem = browser.find_element_by_id('show_app')
elem.click()
sleep(5)
elem = browser.find_element_by_css_selector("a[class='ax list-group-item']")
elem.click()
doctors = browser.find_elements_by_css_selector("a[class='ax list-group-item']")[2:]
tallons = {}
for doc in doctors:
temp = doc.text.split('\n')
if len(temp) == 2:
tallons[temp[1]]=temp[0]
print(tallons)
browser.refresh()
sleep(300)
This script works but I don't like that it opens a browser window.
you can scrape the ajax request just search data-lid for POST body and set correct headers
s = requests.session()
page = s.get('https://samozapis-spb.ru/moskovskiy-rayon/ctomatologicheskaya-poliklinika-no12')
soup = BeautifulSoup(page.text, 'html.parser')
# get "data-lid" from the page
spec = soup.find("div", id="spec")
# do ajax request
data = {"lid": spec["data-lid"]}
headers = {"x-requested-with" : "XMLHttpRequest"}
ajax = s.post('https://samozapis-spb.ru/_api_v3/spec.php', data=data, headers=headers).json()
spec = soup.find("div", id="spec")
soup = BeautifulSoup(ajax['html'], 'html.parser')
doctors = soup.select("a[class='ax list-group-item']")[2:]
print(doctors)
I am trying to make a scraping application to scrape Hants.gov.uk and right now I am working on it just clicking the pages instead of scraping. When it gets to the last row on page 1 it just stopped, so what I did was make it click button "Next Page" but first it has to go back to the original URL. It clicks page 2, but after page 2 is scraped it doesn't go to page 3, it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
def start():
elements = driver.find_elements_by_css_selector(".searchResult a")
links = [link.get_attribute("href") for link in elements]
result = []
for link in links:
if link not in result:
result.append(link)
else:
driver.get(link)
goUrl = urllib.request.urlopen(link)
soup = BeautifulSoup(goUrl.read(), "html.parser")
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
for i in range(20):
pass # Don't worry about all this commented code, it isn't relevant right now
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
#print(table.text)
# div = soup.select("div.applicationDetails")
# getDiv = div[i].split(":")[1].get_text()
# log = open("log.txt", "a")
# log.write(getDiv + "\n")
#log.write("\n")
start()
driver.get(url)
for i in range(5):
driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
url = driver.current_url
start()
driver.get(url)
driver.close()
try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
result = []
def start():
elements = driver.find_elements_by_css_selector(".searchResult a")
links = [link.get_attribute("href") for link in elements]
result.extend(links)
def start2():
for link in result:
# if link not in result:
# result.append(link)
# else:
driver.get(link)
goUrl = urllib.request.urlopen(link)
soup = BeautifulSoup(goUrl.read(), "html.parser")
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
for i in range(20):
pass # Don't worry about all this commented code, it isn't relevant right now
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
#print(table.text)
# div = soup.select("div.applicationDetails")
# getDiv = div[i].split(":")[1].get_text()
# log = open("log.txt", "a")
# log.write(getDiv + "\n")
#log.write("\n")
while True:
start()
element = driver.find_element_by_class_name('rdpPageNext')
try:
check = element.get_attribute('onclick')
if check != "return false;":
element.click()
else:
break
except:
break
print(result)
start2()
driver.get(url)
As per the url https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
print("Perform your scrapping here on page {}".format(str(i+1)))
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[#id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[#class='rdpWrap rdpNumPart']//a[#class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scrapping here on page 1
Perform your scrapping here on page 2
Perform your scrapping here on page 3
Perform your scrapping here on page 4
Perform your scrapping here on page 5
Perform your scrapping here on page 6
Perform your scrapping here on page 7
Perform your scrapping here on page 8
hi #Feitan Portor you have written the code absolutely perfect the only reason that you are redirected back to the first page is because you have given url = driver.current_url in the last for loop where it is the url that remains static and only the java script that instigates the next click event so just remove url = driver.current_url and driver.get(url)
and you are good to go i have tested my self
also to get the current page that your scraper is in just add this part in the for loop so you will get to know where your scraper is :
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion
This code will crawl again from the beginning every time an error occurs. I want to change this to crawl only new text, not just from the beginning.
and I would like to ask for further advice.
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup, Comment
import pandas as pd
#Setting up Chrome webdriver Options
#chrome_options = webdriver.ChromeOptions()
#setting up local path of chrome binary file
#chrome_options.binary_location = "/Users/Norefly/chromedriver2/chromedriver.exec"
#creating Chrome webdriver instance with the set chrome_options
driver = webdriver.PhantomJS("C:/Python/phantomjs-2.1.1-windows/bin/phantomjs.exe")
link = "https://play.google.com/store/apps/details?id=com.supercell.clashofclans&hl=en"
driver.get(link)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
Ptitle = driver.find_element_by_class_name('id-app-title').text.replace(' ','')
print(Ptitle)
#driver.find_element_by_xpath('//*[#id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]').click()
sleep(1)
driver.find_element_by_xpath('//*[#id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
#select_newest.select_by_visible_text('Newest')
#driver.find_element_by_xpath('//*[#id="body- content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
sleep(2)
#driver.find_element_by_css_selector('.review-filter.id-review-sort-filter.dropdown-menu-container').click()
driver.find_element_by_css_selector('.displayed-child').click()
#driver.find_element_by_xpath("//button[#data-dropdown-value='1']").click()
driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")
reviews_df = []
for i in range(1,10):
try:
for elem in driver.find_elements_by_class_name('single-review'):
print(str(i))
content = elem.get_attribute('outerHTML')
soup = BeautifulSoup(content, "html.parser")
#print(soup.prettify())
date = soup.find('span',class_='review-date').get_text()
rating = soup.find('div',class_='tiny-star')['aria-label'][6:7]
title = soup.find('span',class_='review-title').get_text()
txt = soup.find('div',class_='review-body').get_text().replace('Full Review','')[len(title)+1:]
print(soup.get_text())
temp = pd.DataFrame({'Date':date,'Rating':rating,'Review Title':title,'Review Text':txt},index=[0])
print('-'*10)
reviews_df.append(temp)
#print(elem)
except:
print('what i can do?')
driver.find_element_by_xpath('//*[#id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
#driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")
#driver.find_element_by_xpath('//*[#id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
reviews_df = pd.concat(reviews_df,ignore_index=True)
reviews_df.to_csv(Ptitle+'review_google.csv', encoding='utf-8')
driver.close()
And I wonder if this is a problem with phantom js