I am trying to start a new thread for each page, but with my current code each thread only starts after the previous thread/function has finished.
Can anyone help me run them independently of each other?
Example:
Thread 1:
Open page 1
Thread 2:
Open page 2
And do this for X amount of pages.
I am a beginner in Python, so excuse my messy code.
import random
import string
import threading
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
# driver.find_element_by_css_selector("a[onclick*='if (!window.__cfRLUnblockHandlers) return false; bail()']")
def randomStringDigits(stringLength=6):
    """Generate a random string of letters and digits """
    lettersAndDigits = string.ascii_letters + string.digits
    return ''.join(random.choice(lettersAndDigits) for i in range(stringLength))

def startscrape(url):
    driver = webdriver.Chrome(executable_path='chromedriver.exe')
    driver.get("urlhere")
    cookies_list = driver.get_cookies()
    cookies_dict = {}  # create dictionary
    usrelem = driver.find_element_by_name("login")
    usrelem.send_keys("user")
    pwdelem = driver.find_element_by_name("password")
    pwdelem.send_keys("pass")
    pwdelem.send_keys(Keys.RETURN)
    sleep(1)
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    xx = soup.find("input",
                   {"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
    driver.get(page)
    wait = WebDriverWait(driver, 10)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    xxx = soup.findAll("a", {"class": "js-lbImage"})
    # find all thumbs
    for link in xxx:
        xxx = soup.find("a", {"href": link.get('href')})
        dlfullimg = driver.find_element_by_xpath("//a[@href='" + xxx.get('href') + "']")
        wait = WebDriverWait(driver, 10)
        dlfullimg.click()
        thumbs = soup.findAll("div", {"class": "lg-thumb-item"})
        dlfullimg = driver.find_element_by_id('lg-download').click()
        close = driver.find_element_by_xpath("//span[@class='lg-close lg-icon']").click()
        sleep(1)
    assert "No results found." not in driver.page_source
url = input("Main URL: ")
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get("urlhere")
cookies_list = driver.get_cookies()
cookies_dict = {}  # create dictionary
usrelem = driver.find_element_by_name("login")
usrelem.send_keys("user")
pwdelem = driver.find_element_by_name("password")
pwdelem.send_keys("pass")
pwdelem.send_keys(Keys.RETURN)
sleep(1)
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Find page number with soup.find
xx = soup.find("input",
               {"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
driver.close()

threads = []
for i in range(int(xx.get('max'))):
    page = url + "page-" + str(i + 1)
    t = threading.Thread(target=startscrape(url), args=[])
    threads.append(t)
for t in threads:
    t.start()
for t in threads:
    t.join()
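For reference, the sequential behaviour most likely comes from target=startscrape(url): that expression calls the function immediately and passes its return value to Thread, so each "thread" runs before the next one is even created. A minimal sketch of handing the callable and its per-page argument to the thread instead, assuming startscrape is changed to accept the page URL:

threads = []
for i in range(int(xx.get('max'))):
    page = url + "page-" + str(i + 1)
    # pass the function itself plus its argument; do not call it here
    t = threading.Thread(target=startscrape, args=(page,))
    threads.append(t)
    t.start()

for t in threads:
    t.join()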
You can use concurrent.futures to handle the heavy lifting for you.
Here is some pseudo-code to do it:
import concurrent.futures
from selenium import webdriver
def process_url(url):
    driver = webdriver.Chrome()
    driver.get(url)
    # process page
    driver.close()

# Find number of pages here
driver = webdriver.Chrome()
driver.get(url)
# urls = find list of urls
driver.close()

threads_count = 10
with concurrent.futures.ThreadPoolExecutor(threads_count) as executor:
    executor.map(process_url, urls)
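Applied to the page loop from the question, a minimal sketch might look like this (assuming startscrape is refactored to take a single page URL and create its own driver, as in the pseudo-code above; max_pages is a stand-in for int(xx.get('max'))):

import concurrent.futures

page_urls = [url + "page-" + str(i + 1) for i in range(max_pages)]

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # each call runs in its own worker thread, so the pages are
    # processed concurrently instead of one after the other
    executor.map(startscrape, page_urls)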
Related
Can anyone help with scraping https://www.whed.net/home.php?
The code I'm using gives me an empty DataFrame. I would love to get the universities with their websites and maybe the fields of study. My scraping skills are weak, so it would be great if you could guide me through this. Thanks.
# imports needed by the snippet below
import time
import pandas as pd
from selenium import webdriver as wb
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException

begin = time.time()
countries = ['Emirates', 'United States of America (all)']
result = []  # List to store all data
univ_links = []  # Links for all universities
fields = ['Street:', 'City:', 'Province:', 'Post Code:', 'WWW:', 'Fields of study:', 'Job title:']
webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe')  # To launch chrome and run script
# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)

#all_countries=[]
cntry_el = webD.find_elements_by_xpath('//*[@id="Chp1"]/option')
#cntry_grp = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup')
grps = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup/option[1]')
for c in cntry_el: countries.append(c.text)
for g in grps: countries.append(g.text)

for cntry in countries:
    select = Select(webD.find_element_by_id('Chp1'))  # select country dropdown
    select.select_by_visible_text(cntry)  # choosing country
    Btn_GO = webD.find_element_by_xpath('//*[@id="fsearch"]/p/input')
    Btn_GO.click()
    select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))  # select results per page drop down
    select_rpp.select_by_visible_text('100')  # choosing 100 results per page option
    university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
    university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')  # list of university elements
    for univ in range(len(university_list)):
        href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')  # University details link
        univ_links.append(href)
    while True:
        try:
            webD.find_element_by_partial_link_text('Next').click()
            university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
            university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')
            for univ in range(len(university_list)):
                href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')  # University details link
                univ_links.append(href)
        except NoSuchElementException:
            break

for l in univ_links:
    webD.get(l)
    webD.implicitly_wait(2)
    title = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[1]').text
    title_detailed = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[2]').text
    cntry_name = webD.find_element_by_xpath('//*[@id="contenu"]/p[2]').text
    t1 = webD.find_elements_by_class_name('dt')
    t2 = webD.find_elements_by_class_name('dd')
    labels = webD.find_elements_by_class_name('libelle')
    content = webD.find_elements_by_class_name('contenu')
    temp = {}
    fos = ''
    fos1 = ''
    temp.update({'Title': title, 'Detailed Title': title_detailed, 'Country': cntry_name})
    for i in range(len(t1)):
        if t1[i].text == '' or t1[i].text == 'Address':
            continue
        else:
            value = t2[i].text
            temp.update({t1[i].text: value.replace('\n', ',')})
    for j in range(len(content)):
        if labels[j].text in fields:
            if labels[j].text == 'Fields of study:':
                info = content[j].text
                fos = fos + ',' + info
            elif labels[j].text == 'Job title:':
                info1 = content[j].text
                fos1 = fos1 + ',' + info1
            else:
                key = labels[j].text
                temp.update({key[:-1]: content[j].text})
    temp.update({'Fields of study': fos.lstrip(','), 'Job titles': fos1.lstrip(',')})
    result.append(temp)

data = pd.DataFrame(result)
data
end = time.time()
print("Time taken : " + str(end - begin) + "s")
data.to_csv("WHED1.csv", index=False)
This code is what I could use, taken from a GitHub project.
It would be great if I could re-create the data and save it. I want it to be used as a dropdown in a web application, just to make sure no mistakes are made when entering the university someone studied at.
Update 1/12/22 - Async
Found a much better solution using aiohttp; it runs through the entire list of countries in ~30 seconds instead of 3 hours.
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def main():
    print("Init")
    driver = init_driver()

    print("Opening Homepage")
    url = "https://www.whed.net/results_institutions.php"
    driver.get(url)
    time.sleep(1)

    print("Gathering Countries")
    countries = get_countries(driver)
    driver.quit()

    print("Scraping")
    start = time.time()
    institution_list = asyncio.run(fetch_all(countries))

    print("Writing out")
    f = open('output.json', 'w')
    f.write(json.dumps(institution_list))
    f.close()

    end = time.time()
    print(f"Total time: {end - start}s")

def init_driver():
    chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
    return driver

def get_countries(driver):
    select = Select(driver.find_element(By.ID, "Chp1"))
    countries = list(map(lambda c: c.get_attribute('value'), select.options))
    countries.pop(0)
    return countries

def extract_institutions(html, country):
    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find('p', {'class': 'infos'}).text
    print(str(page))
    number_of_institutions = str(page).split()[0]
    if number_of_institutions == 'No':
        print(f"No results for {country}")
        return []
    results = []
    inst_index = 0
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
        inst_index += 1
    return {
        'country': country,
        'count': number_of_institutions,
        'records': results
    }

async def get_institutions(country, session):
    try:
        async with session.post(
            url='https://www.whed.net/results_institutions.php',
            data={"Chp1": country, "nbr_ref_pge": 10000}
        ) as response:
            html = await response.read()
            print(f"Successfully got {country}")
            return extract_institutions(html, country)
    except Exception as e:
        print(f"Unable to get {country} due to {e.__class__}.")

async def fetch_all(countries):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*[get_institutions(country, session) for country in countries])

# Main call
main()
Old answer using a synchronous algorithm
Improving on @Mithun's answer, since it doesn't really work as written: it gets stuck on the same page.
I also added direct access to the name and url to make it easier in case you want to access those.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
print("Selecting country")
select = Select(driver.find_element(By.ID, "Chp1"))
country = "Albania"
select.select_by_visible_text(country)
time.sleep(.5)
print("Searching")
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)
print("Parsing")
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
results = []
while True:
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
    print(f'{len(results)}/{number_of_pages}')
    if counter >= int(number_of_pages):
        break
    counter += 10
    driver.find_element(By.LINK_TEXT, "Next page").click()
    time.sleep(0.5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(results)
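If the goal is the dropdown data mentioned in the question, the results list above can be written straight out as a CSV of names, URLs and countries. A minimal sketch, assuming pandas is available (the file name is arbitrary):

import pandas as pd

# each entry in results is a dict with 'name', 'url' and 'country' keys
df = pd.DataFrame(results)
df.to_csv("whed_institutions.csv", index=False)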
You can use Selenium to scrape the data. The following code will help you scrape the university names for "United States of America (all)". You can scrape other countries the same way, either in a loop or by entering the name manually. If you need the fields of study for every university, you can grab each university's href with bs4 and scrape its detail page (a sketch of that follows the code).
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
driver = webdriver.Chrome(r"chromedriver.exe")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
select = Select(driver.find_element(By.ID, "Chp1"))
select.select_by_visible_text("United States of America (all)")
time.sleep(1)
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
while counter < int(number_of_pages):
    raw = soup.find_all('div', {'class': 'details'})
    for i in raw:
        i = (str(i.text).lstrip())
        i = i.replace("\n", "")
        i = i.replace("\r", "")
        i = i.replace("\t", "")
        print(i)
    next_page = driver.find_element(By.LINK_TEXT, "Next page").click()
    counter += 10
driver.quit()
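For the fields of study mentioned above, one possible approach is to collect each university's detail link (the href inside a div.details entry) and parse the detail page separately. This is only a sketch: it assumes the detail pages can be fetched directly with requests (otherwise reuse the Selenium driver) and that they expose a "Fields of study:" label in elements with the libelle/contenu classes used in the question's code:

import requests
from bs4 import BeautifulSoup

def fields_of_study(detail_url):
    # detail_url is the href taken from an <a> inside a div.details entry
    html = requests.get(detail_url).text
    soup = BeautifulSoup(html, 'html.parser')
    labels = soup.find_all(class_='libelle')
    values = soup.find_all(class_='contenu')
    for label, value in zip(labels, values):
        if label.text.strip() == 'Fields of study:':
            return value.text.strip()
    return ''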
How do you scrape a web page with infinite scrolling?
My first try was using Selenium, but it gets detected as a robot:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
url = 'https://www.bloomberg.com/search?query=indonesia%20mining'
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=options)
driver.get(url)
html = driver.page_source.encode('utf-8')
page_num = 0
while driver.find_elements_by_css_selector('.contentWell__a8d28605a5'):
    driver.find_element_by_css_selector('.contentWell__a8d28605a5').click()
    page_num += 1
    print("getting page number " + str(page_num))
    time.sleep(1)

html = driver.page_source.encode('utf-8')
soup = BeautifulSoup(html, 'lxml')
titles = soup.find_all('div', {"class": "text__d88756958e withThumbnail__c4ffc902a6"})
df = pd.DataFrame(columns=['judul', 'link'])
news = {}
for t in titles:
    news['judul'] = t.find('a', {'class': 'headline__96ba1917df'}).text.strip()
    news['link'] = t.find('a', {'class': 'headline__96ba1917df'}).get('href')
    df = df.append(news, ignore_index=True)
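For a page that loads content on scroll rather than on a "load more" click, a commonly used pattern is to scroll to the bottom repeatedly until the page height stops growing. A minimal, site-agnostic sketch, reusing the driver from the snippet above:

import time

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the page time to load the next batch
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # no new content was loaded
    last_height = new_height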
Any idea how to limit the maximum page number?
My current code is:
import time
import numpy as np
import requests
from bs4 import BeautifulSoup as soup
from selenium import webdriver
first_page_url = 'https://store.steampowered.com/tags/en/Action/#p=0&tab=NewReleases'
first_url = 'https://store.steampowered.com/tags/en/Action/#p='
rest_url = '&tab=NewReleases'
driver = webdriver.Chrome()
driver.get(first_page_url)
soup_page = driver.page_source
print(type(soup_page))
soup_page = soup(driver.page_source,'lxml')
page_numbers = soup_page.find('div',{'id':'NewReleases_ctn'})
page_numbers = page_numbers.text.split()
#print(page_numbers)
last_page_number = page_numbers[13]
print(last_page_number)
last_page_number = last_page_number.replace(',','')
last_page_number = int(last_page_number)
last_page_number = last_page_number / 15
print(last_page_number)
last_page_number = round(last_page_number)
pages = range(0,last_page_number)
index_number = 0
nana = 1
for page in pages:
    page = first_url + str(pages[index_number]) + rest_url
    print(type(page))
    driver = webdriver.Chrome()
    driver.get(page)
    time.sleep(5)
    soup_page = driver.page_source
    #print(type(soup_page))
    soup_page = soup(driver.page_source, 'lxml')
    new_releases = soup_page.find('div', {'id': 'NewReleasesRows'})
    containers = new_releases.find_all('a', {'class': 'tab_item'})
    container = containers[0]
    driver.close()
    for container in containers:
        title = container.find('div', {'class': 'tab_item_name'}).text
        print(nana, title)
        nana = nana + 1
    index_number = index_number + 1
The code works as intended, except that every time it goes through the outer for loop it opens a new webdriver.Chrome().
Is there a way to reuse the already opened browser instead of opening a new one on each iteration?
The driver object is created globally, and you are invoking Chrome again inside the loop (the line commented out below). Remove that line and all pages will load in the same Chrome instance:
for page in pages:
    page = first_url + str(pages[index_number]) + rest_url
    print(type(page))
    # driver = webdriver.Chrome()   # remove this line and reuse the global driver
    driver.get(page)
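Note that the original loop also calls driver.close() inside the loop body; for a single reused browser that call has to move outside the loop as well. A minimal sketch of the reworked loop, under that assumption and using the names from the question:

driver = webdriver.Chrome()  # created once, before the loop

for page_number in pages:
    page = first_url + str(page_number) + rest_url
    driver.get(page)
    time.sleep(5)
    soup_page = soup(driver.page_source, 'lxml')
    new_releases = soup_page.find('div', {'id': 'NewReleasesRows'})
    containers = new_releases.find_all('a', {'class': 'tab_item'})
    for container in containers:
        title = container.find('div', {'class': 'tab_item_name'}).text
        print(title)

driver.close()  # closed once, after all pages are done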
This would loop through all 71 pages instead, without the need to grab any page number at all:
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(first_page_url)

while True:
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, "//span[not(@class='pagebtn disabled') and @id='NewReleases_btn_next']"))).click()
    except:
        break
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
I am trying to make a scraping application to scrape Hants.gov.uk, and right now I am just clicking through the pages instead of scraping. When it gets to the last row on page 1 it just stops, so what I did was make it click the "Next Page" button, but first it has to go back to the original URL. It clicks through to page 2, but after page 2 is scraped it doesn't go on to page 3; it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result = []
    for link in links:
        if link not in result:
            result.append(link)
        else:
            driver.get(link)
            goUrl = urllib.request.urlopen(link)
            soup = BeautifulSoup(goUrl.read(), "html.parser")
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            for i in range(20):
                pass  # Don't worry about all this commented code, it isn't relevant right now
                #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
                #print(table.text)
                # div = soup.select("div.applicationDetails")
                # getDiv = div[i].split(":")[1].get_text()
                # log = open("log.txt", "a")
                # log.write(getDiv + "\n")
            #log.write("\n")

start()
driver.get(url)
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    url = driver.current_url
    start()
    driver.get(url)
driver.close()
try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
result = []
def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result.extend(links)

def start2():
    for link in result:
        # if link not in result:
        #     result.append(link)
        # else:
        driver.get(link)
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass  # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
        #log.write("\n")

while True:
    start()
    element = driver.find_element_by_class_name('rdpPageNext')
    try:
        check = element.get_attribute('onclick')
        if check != "return false;":
            element.click()
        else:
            break
    except:
        break

print(result)
start2()
driver.get(url)
As per the url https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scraping here on page {}".format(str(i+1)))
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[@class='rdpWrap rdpNumPart']//a[@class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scraping here on page 1
Perform your scraping here on page 2
Perform your scraping here on page 3
Perform your scraping here on page 4
Perform your scraping here on page 5
Perform your scraping here on page 6
Perform your scraping here on page 7
Perform your scraping here on page 8
Hi @Feitan Portor, you have written the code absolutely fine. The only reason you are redirected back to the first page is that you assign url = driver.current_url in the last for loop; that URL stays static, because it is only JavaScript that triggers the next-page click event. So just remove url = driver.current_url and driver.get(url) and you are good to go; I have tested it myself.
Also, to know which page your scraper is currently on, add this inside the for loop:
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this resolves your confusion.
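Put together, the paging loop from the question would then look roughly like this (a sketch based on the advice above, keeping the original element IDs and the fixed count of 5 clicks):

start()
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    ss = driver.find_element_by_class_name('rdpCurrentPage').text
    print(ss)  # current page number, as suggested above
    start()
driver.close()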
This code crawls again from the beginning every time an error occurs. I want to change it so that it only crawls the new text instead of starting over from the beginning.
I would also like to ask for further advice.
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup, Comment
import pandas as pd
#Setting up Chrome webdriver Options
#chrome_options = webdriver.ChromeOptions()
#setting up local path of chrome binary file
#chrome_options.binary_location = "/Users/Norefly/chromedriver2/chromedriver.exec"
#creating Chrome webdriver instance with the set chrome_options
driver = webdriver.PhantomJS("C:/Python/phantomjs-2.1.1-windows/bin/phantomjs.exe")
link = "https://play.google.com/store/apps/details?id=com.supercell.clashofclans&hl=en"
driver.get(link)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
Ptitle = driver.find_element_by_class_name('id-app-title').text.replace(' ','')
print(Ptitle)
#driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]').click()
sleep(1)
driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
#select_newest.select_by_visible_text('Newest')
#driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
sleep(2)
#driver.find_element_by_css_selector('.review-filter.id-review-sort-filter.dropdown-menu-container').click()
driver.find_element_by_css_selector('.displayed-child').click()
#driver.find_element_by_xpath("//button[@data-dropdown-value='1']").click()
driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")
reviews_df = []
for i in range(1, 10):
    try:
        for elem in driver.find_elements_by_class_name('single-review'):
            print(str(i))
            content = elem.get_attribute('outerHTML')
            soup = BeautifulSoup(content, "html.parser")
            #print(soup.prettify())
            date = soup.find('span', class_='review-date').get_text()
            rating = soup.find('div', class_='tiny-star')['aria-label'][6:7]
            title = soup.find('span', class_='review-title').get_text()
            txt = soup.find('div', class_='review-body').get_text().replace('Full Review', '')[len(title)+1:]
            print(soup.get_text())
            temp = pd.DataFrame({'Date': date, 'Rating': rating, 'Review Title': title, 'Review Text': txt}, index=[0])
            print('-'*10)
            reviews_df.append(temp)
            #print(elem)
    except:
        print('what i can do?')
    driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
    #driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")
    #driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
reviews_df = pd.concat(reviews_df,ignore_index=True)
reviews_df.to_csv(Ptitle+'review_google.csv', encoding='utf-8')
driver.close()
I also wonder whether this is a problem with PhantomJS.
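On resuming instead of re-crawling from the beginning: a common pattern, sketched below under the assumption that a review can be identified by its date, title and text, is to keep a set of keys for reviews that were already collected and skip them on later passes, so a crash does not force re-collecting everything:

import pandas as pd

seen = set()       # keys of reviews already collected
collected = []     # DataFrames for new reviews only

def add_review(date, rating, title, txt):
    key = (date, title, txt)  # crude identity for a review
    if key in seen:
        return                # already crawled, skip it
    seen.add(key)
    collected.append(pd.DataFrame(
        {'Date': date, 'Rating': rating, 'Review Title': title, 'Review Text': txt},
        index=[0]))

# inside the scraping loop, call add_review(...) instead of appending directly;
# after a failure, restart the loop and only new reviews are kept.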