Pages take 2 loadings to complete - python

On this website, https://toptees.store/linux-funny-cloud-computing, I am trying to scrape the text of the "sold" span, but the page has to be loaded twice before it renders completely. Because of that, the data is not scraped.
My Code:
import requests
from bs4 import BeautifulSoup
# Download the page with a single HTTP GET and parse the returned HTML.
page_url = "https://toptees.store/linux-funny-cloud-computing"
response = requests.get(page_url)
document = BeautifulSoup(response.text, 'lxml')

# Collect every <span> carrying the "ng-binding" class and show the result.
sold = document.find_all("span", class_='ng-binding')
print(sold)
I also tried Selenium together with BeautifulSoup:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# Start Chrome with a driver binary fetched by webdriver_manager.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) # ,options=options
filepath = 'urls.txt'

# One URL per line; blank lines are skipped so driver.get() never receives "".
with open(filepath) as f:
    urls = [line.strip() for line in f if line.strip()]
titles = []

try:
    # The window size does not depend on the URL, so set it once up front.
    driver.maximize_window()
    for url in urls:
        driver.get(url)
        # Fixed pause to give the page time to render before reading the DOM.
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        sold = soup.find('span', class_="ng-binding")
        print(sold)
finally:
    # Always shut the browser down, even if one of the pages fails to load.
    driver.quit()
The output looks like this: []. How can I scrape this link with BeautifulSoup?

This is one way to get that information you're after:
import time as t
from bs4 import BeautifulSoup as bs
import undetected_chromedriver as uc
options = uc.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument('--disable-notifications')
options.add_argument("--window-size=1280,720")
# options.add_argument('--headless')
browser = uc.Chrome(options=options)
url = 'https://toptees.store/linux-funny-cloud-computing'

try:
    # The page is requested twice on purpose: per the question, it only
    # renders completely on the second load.
    browser.get(url)
    t.sleep(1)
    browser.get(url)
    t.sleep(7)
    soup = bs(browser.page_source, 'html.parser')
finally:
    # Release the browser whether or not the page could be fetched.
    browser.quit()

sold = soup.select_one('days-available[any-sold="campaign.sold"]')
title = soup.select_one('h1.campaign-name-title')
if sold is None or title is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise SystemExit('Expected elements were not found in the rendered page')

# The first non-blank, space-separated token of the element text is the count.
tokens = [x for x in sold.text.split(' ') if len(x.strip()) > 0]
if not tokens:
    raise SystemExit('The "sold" element contains no text')
print(title.text, '|', tokens[0])
Result printed in terminal:
Linux funny Cloud Computing | 43
For undetected_chromedriver, please see https://pypi.org/project/undetected-chromedriver/ [instructions on how to set it up, etc]

Related

Web scraping on Rotten Tomatoes --- **I want to be able to scrape over 100 movies, but right now I have only scraped three.**

I should be getting 100 different movies from the Rotten Tomatoes website, with their movie name, source, rating, text review, and date shown in data.head().
from bs4 import BeautifulSoup
import re
import time
import requests
#!pip install selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
movie_list = ['divergent', 'top_gun', 'pursuit_of_happiness']

# Patterns are loop-invariant, so compile them once.
review_row_re = re.compile('review_table_row')
review_icon_re = re.compile("review_icon")
review_text_re = re.compile("the_review")
review_date_re = re.compile("review-date subtle small")

# NOTE(review): `name` is not defined in the visible code; it must be set
# before this point or the open() call raises NameError.
with open(name + "_" + ".csv", 'w',encoding='utf-8') as fw:
    for movie in movie_list:
        pageLink = 'https://www.rottentomatoes.com/m/'+ movie +'/reviews/'
        path = "/Users/name/desktop/chromedriver"
        s = Service(path)
        browser = webdriver.Chrome(service=s)
        try:
            browser.get(pageLink)
            pageNum = 10000  # upper bound; the loop stops earlier when "next" is gone
            for p in range(0,pageNum):
                print ('page',p+1)
                page_source = browser.page_source
                soup = BeautifulSoup(page_source, 'lxml')
                reviews=soup.findAll('div', {'class':review_row_re})
                for review in reviews:
                    rating,text,date='NA','NA','NA'
                    rating_info =review.find('div',{'class':review_icon_re})
                    if rating_info:
                        # The rating is taken from the 4th CSS class of the icon;
                        # keep 'NA' when the element has fewer classes.
                        icon_classes = rating_info.attrs.get("class", [])
                        if len(icon_classes) > 3:
                            rating = icon_classes[3]
                            print(rating)
                    text_info =review.find('div',{'class':review_text_re})
                    if text_info:
                        text = text_info.text.strip()
                        print(text)
                    review_date =review.find('div',{'class':review_date_re})
                    if review_date:
                        date = review_date.text.strip()
                        print(date)
                    fw.write(rating+'\t'+text+'\t'+date+'\n')
                # move to the next page by clicking on the "next" button with selenium
                #<span class="prev-next-paging__button-text">Next</span>
                try:
                    browser.find_element(By.XPATH,'//button[@class="js-prev-next-paging-next btn prev-next-paging__button prev-next-paging__button-right"]').click()
                except NoSuchElementException:
                    # No "next" button on this page: this movie is finished.
                    break
                time.sleep(2)
        finally:
            # One browser is started per movie, so one must be closed per movie.
            browser.quit()

# NOTE(review): `pd` is not imported in the visible code, the file read here
# ("your_name.csv") does not match the name written above, and only three
# columns (rating, text, date) are written while five are named below.
data = pd.read_csv("your_name.csv", delimiter= "\t", header = None)
data.columns = ['Movie', 'Source','Rating', 'Text_Review', 'Date']
data.head()
I was trying to do it manually, but I think there is a faster and more efficient way to do it by web scraping; however, I am not sure how. Maybe by using a link that contains the top 100 movies?

Selenium parsing amazon (Python)

Hello, I'm trying to parse all the star ratings as text (4, 1, 4, 2, etc.).
driver.get('https://www.amazon.com/gp/new-releases/kitchen/ref=zg_bs_tab_t_bsnr')
# Wait up to 20 seconds for the rating icons to be present in the DOM.
elements=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,".a-icon-alt")))
for element in elements:
    # Keep only the first space-separated token of the element's inner HTML.
    rating_text = element.get_attribute("innerHTML")
    list3.append(rating_text.split(' ')[0])
I want to parse the review stars and, if a review star does not exist, print something in its place.
Try:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    url = "https://www.amazon.com/gp/new-releases/kitchen/ref=zg_bs_tab_t_bsnr"
    driver.get(url)
    # Fixed pause so the listing has time to render before it is parsed.
    time.sleep(2)
    soup=bs(driver.page_source,'html.parser')
finally:
    # The parsed snapshot is all that is needed from here on.
    driver.quit()

# Walk the product cards so that a card without a rating still yields a line.
for card in soup.findAll('span', {'class': 'aok-inline-block zg-item'}):
    elem = card.find('span', {'class': 'a-icon-alt'})
    words = elem.text.split() if elem else []
    if words:
        print(words[0])
    else:
        print("no")

Scrape permalinks to answers posted under a certain question on Quora via Python-Selenium Web Driver

I am a beginner in Python-Selenium scraping. I want to scrape permalink of all the Quora answers posted under a question. So far I have created the following code snippet. But when I run it, it gives me only one link in the output. This is due to the fact that the page isn't loaded fully I guess. What should I do to get at least 100 permalinks to answers from the page source?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver_option = webdriver.ChromeOptions()
driver_option.add_argument("--incognito")
chromedriver_path = './chromedriver'


def create_webdriver():
    """Return a Chrome driver built from the module-level path and options."""
    return webdriver.Chrome(executable_path=chromedriver_path, chrome_options=driver_option)


# Open the website
browser = create_webdriver()
try:
    browser.get("https://www.quora.com/How-do-I-prove-the-flat-earth-theory")
    projects = browser.find_elements_by_xpath("//a[@class='answer_permalink']")
    with open('file_text.txt', 'w') as f:
        for proj in projects:
            anslink = proj.get_attribute('href')
            if anslink:
                # One link per line; without the newline all links run together.
                f.write(anslink + '\n')
finally:
    # Close the browser even when loading or locating elements fails.
    browser.quit()
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.quora.com/How-do-I-prove-the-flat-earth-theory")
# Surface HTTP errors explicitly instead of silently printing nothing.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
# Print the href of every answer permalink found in the returned HTML.
for item in soup.findAll("a", {'class': 'answer_permalink'}):
    print(item.get("href"))
Output:
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Morgan-14
/How-do-I-prove-the-flat-earth-theory/answer/Ken-Natco
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Writer
/How-do-I-prove-the-flat-earth-theory/answer/Chance-Thompson-13
/How-do-I-prove-the-flat-earth-theory/answers/27223260
/How-do-I-prove-the-flat-earth-theory/answers/26836797
/How-do-I-prove-the-flat-earth-theory/answer/Frida-Schiess
/How-do-I-prove-the-flat-earth-theory/answer/Pierre-Ripplinger
/How-do-I-prove-the-flat-earth-theory/answer/Jacob-Fu
/How-do-I-prove-the-flat-earth-theory/answer/Mike-Howells-4
/How-do-I-prove-the-flat-earth-theory/answer/Mick-Stute
/How-do-I-prove-the-flat-earth-theory/answer/Jesse-Bridges-III
/How-do-I-prove-the-flat-earth-theory/answer/Renard-Leblanc
/How-do-I-prove-the-flat-earth-theory/answers/26831140
/How-do-I-prove-the-flat-earth-theory/answers/27158717
/How-do-I-prove-the-flat-earth-theory/answer/Chris-Lockwood-4
/How-do-I-prove-the-flat-earth-theory/answer/David-Minger
/How-do-I-prove-the-flat-earth-theory/answer/Rick-Brown-50
/How-do-I-prove-the-flat-earth-theory/answer/Jacques-Malan-4
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Lent-1
/How-do-I-prove-the-flat-earth-theory/answers/79419339
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Consiglio
/How-do-I-prove-the-flat-earth-theory/answers/65113366
/How-do-I-prove-the-flat-earth-theory/answer/Krishnabh-Medhi
Selenium Approach:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
# Scrolls to the bottom of the document and returns the document height.
SCROLL_JS = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"
TARGET_HEIGHT = 51000   # stop once the page has grown to this height
MAX_STALLED_SCROLLS = 20  # give up after this many scrolls without growth

options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
try:
    driver.get(
        'https://www.quora.com/How-do-I-prove-the-flat-earth-theory')
    lenOfPage = driver.execute_script(SCROLL_JS)
    stalled = 0
    while True:
        lastCount = lenOfPage
        lenOfPage = driver.execute_script(SCROLL_JS)
        if lastCount >= TARGET_HEIGHT:
            break
        if lenOfPage == lastCount:
            # The height did not change. Wait briefly and retry a bounded
            # number of times so a page shorter than TARGET_HEIGHT cannot
            # keep this loop running forever.
            stalled += 1
            if stalled >= MAX_STALLED_SCROLLS:
                break
            time.sleep(0.5)
        else:
            stalled = 0

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    count = 0
    for item in soup.findAll("a", {'class': 'answer_permalink'}):
        count += 1
        print(item.get("href"))
    print(count)
finally:
    driver.quit()
Output:
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Morgan-14
/How-do-I-prove-the-flat-earth-theory/answer/Ken-Natco
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Writer
/How-do-I-prove-the-flat-earth-theory/answer/Chance-Thompson-13
/How-do-I-prove-the-flat-earth-theory/answers/27223260
/How-do-I-prove-the-flat-earth-theory/answers/26836797
/How-do-I-prove-the-flat-earth-theory/answer/Frida-Schiess
/How-do-I-prove-the-flat-earth-theory/answer/Pierre-Ripplinger
/How-do-I-prove-the-flat-earth-theory/answer/Jacob-Fu
/How-do-I-prove-the-flat-earth-theory/answer/Mike-Howells-4
/How-do-I-prove-the-flat-earth-theory/answer/Mick-Stute
/How-do-I-prove-the-flat-earth-theory/answer/Jesse-Bridges-III
/How-do-I-prove-the-flat-earth-theory/answer/Renard-Leblanc
/How-do-I-prove-the-flat-earth-theory/answers/26831140
/How-do-I-prove-the-flat-earth-theory/answer/Danya-Rose
/How-do-I-prove-the-flat-earth-theory/answer/Chris-Lockwood-4
/How-do-I-prove-the-flat-earth-theory/answer/David-Minger
/How-do-I-prove-the-flat-earth-theory/answer/Rick-Brown-50
/How-do-I-prove-the-flat-earth-theory/answer/Jacques-Malan-4
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Lent-1
/How-do-I-prove-the-flat-earth-theory/answer/John-Lind-22
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Consiglio
/How-do-I-prove-the-flat-earth-theory/answers/65113366
/How-do-I-prove-the-flat-earth-theory/answer/Krishnabh-Medhi
/How-do-I-prove-the-flat-earth-theory/answers/44569062
/How-do-I-prove-the-flat-earth-theory/answer/Abd-Ul-Rahman-Lomax
/How-do-I-prove-the-flat-earth-theory/answer/Helmut-Walle
/How-do-I-prove-the-flat-earth-theory/answer/Ed-Kohlwey-1
/How-do-I-prove-the-flat-earth-theory/answer/Jason-Ree-4
/How-do-I-prove-the-flat-earth-theory/answer/Drew-Curry
/How-do-I-prove-the-flat-earth-theory/answer/Darrel-Blakely-2
/How-do-I-prove-the-flat-earth-theory/answer/Alexander-Kunz-2
/How-do-I-prove-the-flat-earth-theory/answer/Michael-Greenberg-61
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Schenker
/How-do-I-prove-the-flat-earth-theory/answer/Gregory-Hart-8
/How-do-I-prove-the-flat-earth-theory/answer/Mark-Giammattei
/How-do-I-prove-the-flat-earth-theory/answer/Vernon-Bender
/How-do-I-prove-the-flat-earth-theory/answer/Brett-Evill
/How-do-I-prove-the-flat-earth-theory/answer/Kurt-Mager
/How-do-I-prove-the-flat-earth-theory/answer/Michael-Brenner-13
/How-do-I-prove-the-flat-earth-theory/answer/Luke-Anderson-87
/How-do-I-prove-the-flat-earth-theory/answer/Sassa-Neuf
/How-do-I-prove-the-flat-earth-theory/answer/Spandan-Mallick
/How-do-I-prove-the-flat-earth-theory/answers/58252346
/How-do-I-prove-the-flat-earth-theory/answer/Timothy-Lamothe
/How-do-I-prove-the-flat-earth-theory/answer/Eric-Schwertfeger
/How-do-I-prove-the-flat-earth-theory/answers/70843234
/How-do-I-prove-the-flat-earth-theory/answer/Tony-Flury
/How-do-I-prove-the-flat-earth-theory/answer/Aji-Jijo
/How-do-I-prove-the-flat-earth-theory/answer/Tia-Eastlake
/How-do-I-prove-the-flat-earth-theory/answer/Michael-Grace-53
/How-do-I-prove-the-flat-earth-theory/answer/Ray-Mason-30
/How-do-I-prove-the-flat-earth-theory/answer/Jimmy-May-2
/How-do-I-prove-the-flat-earth-theory/answer/Thomas-Edward-Samuel-Thomas
/How-do-I-prove-the-flat-earth-theory/answer/Alan-Atkinson-4
/How-do-I-prove-the-flat-earth-theory/answer/Joseph-Perkins-11
/How-do-I-prove-the-flat-earth-theory/answer/David-Ridlen
/How-do-I-prove-the-flat-earth-theory/answer/Charles-Li-86
/How-do-I-prove-the-flat-earth-theory/answers/140610748
/How-do-I-prove-the-flat-earth-theory/answer/Corentin-Oger
/How-do-I-prove-the-flat-earth-theory/answer/Jean-Pierre-Choisy
/How-do-I-prove-the-flat-earth-theory/answer/Tom-Kubin
/How-do-I-prove-the-flat-earth-theory/answers/120618033
/How-do-I-prove-the-flat-earth-theory/answer/Charles-Brenchley-1
/How-do-I-prove-the-flat-earth-theory/answer/Jonathan-Johnson-41
/How-do-I-prove-the-flat-earth-theory/answer/Edward-Teach-53
/How-do-I-prove-the-flat-earth-theory/answer/Tony-Price-50
/How-do-I-prove-the-flat-earth-theory/answer/Nathaniel-Day-8
/How-do-I-prove-the-flat-earth-theory/answer/Nuurussubchiy-Fikriy
/How-do-I-prove-the-flat-earth-theory/answers/150581075
/How-do-I-prove-the-flat-earth-theory/answers/87762707
/How-do-I-prove-the-flat-earth-theory/answer/Neil-219
/How-do-I-prove-the-flat-earth-theory/answer/Alex-Frantz-1
/How-do-I-prove-the-flat-earth-theory/answer/Andy-P-Zbinden
/How-do-I-prove-the-flat-earth-theory/answer/Uriel-Anderson-4
/How-do-I-prove-the-flat-earth-theory/answer/Chris-OLeary-19
/How-do-I-prove-the-flat-earth-theory/answer/Daniel-Gerber-7
/How-do-I-prove-the-flat-earth-theory/answer/Roy-Wilson-64
/How-do-I-prove-the-flat-earth-theory/answer/Randy-Wonsowicz-Jr
/How-do-I-prove-the-flat-earth-theory/answer/Leslie-Harrington-4
/How-do-I-prove-the-flat-earth-theory/answer/Eddie-Olsson
/How-do-I-prove-the-flat-earth-theory/answer/Vincent-Emery
/How-do-I-prove-the-flat-earth-theory/answer/Maxwell-Perry-3
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Granovsky
/How-do-I-prove-the-flat-earth-theory/answers/83259600
/How-do-I-prove-the-flat-earth-theory/answer/Benjamin-Dixon-17
/How-do-I-prove-the-flat-earth-theory/answer/John-Chambers-75
/How-do-I-prove-the-flat-earth-theory/answer/Ryne-Hanz
/How-do-I-prove-the-flat-earth-theory/answer/Eric-Rodriguez-137
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Hopkins-90
/How-do-I-prove-the-flat-earth-theory/answer/Sasha-Maddah
/How-do-I-prove-the-flat-earth-theory/answer/Owen-Lee-126
/How-do-I-prove-the-flat-earth-theory/answer/David-Phillips-133
/How-do-I-prove-the-flat-earth-theory/answer/Hasan-Poonawala-1
/How-do-I-prove-the-flat-earth-theory/answer/Cristiano-Dal-Vi
/How-do-I-prove-the-flat-earth-theory/answer/Rex-Newborn
/How-do-I-prove-the-flat-earth-theory/answer/John-Neumann-9
/How-do-I-prove-the-flat-earth-theory/answer/Josh-D-Davis
/How-do-I-prove-the-flat-earth-theory/answer/Maruthi-Sreenath
/How-do-I-prove-the-flat-earth-theory/answer/Clint-Morgan-2
/How-do-I-prove-the-flat-earth-theory/answer/Nicholas-Volkmuth
/How-do-I-prove-the-flat-earth-theory/answer/Richard-Swim
/How-do-I-prove-the-flat-earth-theory/answers/143504277
/How-do-I-prove-the-flat-earth-theory/answer/Christer-Svanström
/How-do-I-prove-the-flat-earth-theory/answer/Steve-Schlackman-2
/How-do-I-prove-the-flat-earth-theory/answers/147597845
/How-do-I-prove-the-flat-earth-theory/answer/Rene-Dukundane-Felix
/How-do-I-prove-the-flat-earth-theory/answers/148753762
/How-do-I-prove-the-flat-earth-theory/answer/Henk-Schuring
/How-do-I-prove-the-flat-earth-theory/answers/135814117
/How-do-I-prove-the-flat-earth-theory/answer/Emilio-Trampuz
/How-do-I-prove-the-flat-earth-theory/answers/40529643
/How-do-I-prove-the-flat-earth-theory/answer/Karl-Sangree
/How-do-I-prove-the-flat-earth-theory/answer/Ted-Carriker
/How-do-I-prove-the-flat-earth-theory/answer/egi-syahban
/How-do-I-prove-the-flat-earth-theory/answer/Mayank-Dahiya-12
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Jones-741
/How-do-I-prove-the-flat-earth-theory/answer/Jimmi-Carlsson-1
/How-do-I-prove-the-flat-earth-theory/answer/Cole-Johnson-24
/How-do-I-prove-the-flat-earth-theory/answer/Kram-Redarsh
/How-do-I-prove-the-flat-earth-theory/answers/64915389

How to scrape review data present in Read more in Flipkart reviews

I am trying to scrape Flipkart to extract the reviews for a product using the requests and BeautifulSoup packages. How can I extract the text that only appears after clicking "Read more" in those reviews?
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2
import re
from bs4 import BeautifulSoup
import unicodedata
def remove_non_ascii_1(text):
    """Return *text* with every non-ASCII character replaced by a space.

    Characters whose code point is below 128 are kept unchanged, so the
    result has the same length as the input.
    """
    return ''.join([i if ord(i) < 128 else ' ' for i in text])
with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
    browser.get(site)
    # `with` closes review.txt even if scraping fails part-way through.
    with open("review.txt", "w") as out:
        for count in range(1, 10):
            # Find the pager button whose label is the page number we want.
            nav_btns = browser.find_elements_by_class_name('_33m_Yg')
            button = None
            for btn in nav_btns:
                label = btn.text.strip()
                # Ignore buttons whose label is not a number.
                if label.isdigit() and int(label) == count:
                    button = btn
                    break
            if button is None:
                # No button for this page number: nothing more to visit.
                break
            button.send_keys(Keys.RETURN)
            WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))

            # Expand every truncated review before taking the page snapshot.
            read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
            for rm in read_more_btns:
                browser.execute_script("return arguments[0].scrollIntoView();", rm)
                # Scroll back up by 150px before clicking.
                browser.execute_script("window.scrollBy(0, -150);")
                rm.click()

            page_source = browser.page_source
            soup = BeautifulSoup(page_source, "lxml")
            ans = soup.find_all("div", class_="_3DCdKt")
            for tag in ans:
                title = unicode(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
                title = remove_non_ascii_1(title)
                content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
                content = remove_non_ascii_1(content)
                # presumably trims the wrapping markup emitted by prettify() — confirm
                content = content[15:-7]
                votes = tag.find_all("span", class_="_1_BQL8")
                upvotes = int(votes[0].string)
                downvotes = int(votes[1].string)
                out.write("Review Title : %s\n\n" % title )
                out.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
                out.write("Review Content :\n%s\n\n\n\n" % content )
Usage:
Install the requirements by running pip install bs4 selenium.
Add geckodriver to the PATH. Follow these instructions.
Put the link of the product in site variable inside the script.
Run the script by running python scrape.py.
Reviews will be saved in the file review.txt.
I had some issues using @CSMaverick's code while accessing the READ MORE link, so I modified the code as per my requirements.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs
def get_source_code(browser):
    """Click every element with class ``_1BWGvX`` and return the page source.

    Args:
        browser: A driver object exposing ``find_elements_by_class_name``
            and ``page_source``.

    Returns:
        The value of ``browser.page_source`` read after all clicks.
    """
    rm_btns = browser.find_elements_by_class_name('_1BWGvX')
    for rm_btn in rm_btns:
        rm_btn.click()
    return browser.page_source
def collect_reviews_attributes(html):
    """Extract (heading, text, rating) tuples from a reviews page.

    Args:
        html: HTML source of the page as a string.

    Returns:
        A list of ``(heading, text, rating)`` string tuples built by zipping
        the three element lists; zip() stops at the shortest list.
    """
    soup_obj = bs(html, "html.parser")
    # attrs must be a dict mapping attribute name to value; the original
    # passed a set literal ({"class", "..."}) by mistake.
    text_tag_divs = soup_obj.find_all('div', attrs={"class": "t-ZTKy"})
    heading_tag_divs = soup_obj.find_all('p', attrs={"class": "_2-N8zT"})
    rating_tag_divs = soup_obj.find_all('div', attrs={"class": "_3LWZlK _1BLPMq"})
    text_tags = [tag.text for tag in text_tag_divs]
    heading_tags = [tag.text for tag in heading_tag_divs]
    rating_tags = [tag.text for tag in rating_tag_divs]
    return list(zip(heading_tags, text_tags, rating_tags))
collector_list = []
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3 # get from the url dynamically or else give large number and try hitting until u get exception
browser.get(url) # open the url in the browser
for _ in range(num_pages):
    page_source_code = get_source_code(browser)
    collector_list.extend(collect_reviews_attributes(page_source_code))
    pager_links = browser.find_elements_by_class_name('_1LKTO3')
    if not pager_links:
        # No pager on this page, so there is nothing to click.
        break
    # previous and next are under same class. Access last element
    next_page = pager_links[-1]
    next_page.click()

Selenium no another page

I am scraping a site page by page, but the problem I ran into today is that when there is no next page, the site returns the previous page without any error, so I cannot tell that the page was the last one.
For example: https://example/page-7
When I try to go to https://example/page-8, which doesn't exist, it gives me
the last page: https://example/page-7
How can I determine that https://example/page-7 was the last page using Python 3?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request
page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-1"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source
for j in range(100):
    soup = BeautifulSoup(dd, "html.parser")
    my_text = list(soup.findAll("div", class_ = "post-content"))
    for i in my_text:
        #collect some data
        pass
    # Pages are addressed as ".../page-N"; j starts at 0 and page 1 is
    # already loaded, so the next page to request is j + 2.
    page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j+2)
    driver.get(page)
    dd = driver.page_source
At first I was thinking about checking for duplicates in the collected data, but this is too slow because I have 30,000 links to collect data from. Maybe there is an easier solution?
Found the answer to my own question.
To find the page url just use driver.current_url
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request
page = "https://www.supermama.lt/forumas/topic/214375-vilma/"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source
current_pages = []
for j in range(100):
    # A request for a page past the end lands on an already-seen URL, so a
    # repeated current_url marks the end of the topic.
    page_url = driver.current_url
    if(page_url not in current_pages):
        current_pages.append(page_url)
        soup = BeautifulSoup(dd, "html.parser")
        my_text = list(soup.findAll("div", class_ = "post-content"))
        for i in my_text:
            #collect some data
            pass
        page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j+2)
        driver.get(page)
        dd = driver.page_source
    else:
        print(current_pages)
        driver.quit()
        break
else:
    # All 100 iterations ran without seeing a repeated URL: still close the browser.
    driver.quit()

Categories