I am trying to scrape Reddit data using Python. The result I get contains only a single subreddit's information, not the complete list.
What I Tried:
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
url = "https://www.reddit.com/"
driver.get(url)
Communities = []
#content = driver.page_source
time.sleep(15)
driver.find_element("id", "header-search-bar").send_keys("BTC")
time.sleep(5)
driver.find_element("id", "header-search-bar").send_keys(Keys.ENTER)
time.sleep(5)
community = driver.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[1]/div/div[1]/a[3]/button')
community.click()
time.sleep(10)
colist = driver.find_elements('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')
for comunity in colist:
    # getting all the Communities
    Name = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')
    Members = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/p/span')
    Description = comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/p')
    # Saving community info
    community_info = [Name.text, Members.text, Description.text]
    Communities.append(community_info)

driver.quit()
communitydf = pd.DataFrame(Communities)
communitydf.columns = ['Community', 'Members', 'Description']
communitydf.to_csv('community_details.csv', index=False)
time.sleep(5)
What I Want:
The above code only fetches the first record, but I want to access all the subreddits returned by the search query. I am new to Python and I think I mixed up the logic.
Any help will be appreciated.
Firstly, you do not wait for all the communities to load; for that you need to scroll the page to the end. Secondly, you look up the same XPath inside the loop, which will always point to the same specific element.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

def get_communities(name: str):
    driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
    url = f"https://www.reddit.com/search/?q={name}&type=sr"
    driver.get(url)

    # Scroll until the page height stops growing, so every community is loaded
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    communities = []
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for x in soup.find('div', {'data-testid': 'communities-list'}).find_all('a', {'data-testid': 'subreddit-link'}):
        communities.append({
            'Name': x.find('h6').get_text(),
            'Members': x.find('span').get_text(),
            'Description': x.find_all('p')[-1].get_text()
        })

    driver.quit()
    return communities

df = pd.DataFrame(get_communities('BTC'))
df.to_csv('community_details.csv', index=False)
But I recommend using the Reddit API instead.
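As a sketch of that route, here is a minimal example using the third-party PRAW library; the credentials are placeholders you would replace with your own app registration, and the attribute names assume the current PRAW API:

import praw
import pandas as pd

# Placeholder credentials -- register an app at https://www.reddit.com/prefs/apps
reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="community-search-example",
)

communities = []
# subreddits.search yields subreddits matching the query
for subreddit in reddit.subreddits.search("BTC", limit=50):
    communities.append({
        'Name': subreddit.display_name,
        'Members': subreddit.subscribers,
        'Description': subreddit.public_description,
    })

pd.DataFrame(communities).to_csv('community_details.csv', index=False)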
Related
I'm working on a project and I'm trying to extract the pictures' URLs from a website.
I tried multiple solutions with the code below, but none of them helped me.
I want each product's image separately.
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
driver = webdriver.Chrome('D:/chromedrive/chromedriver.exe')
wshoes = []
url= "https://www.nike.com/gb/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok"
driver.get(url)
SCROLL_PAUSE_TIME = 1
time.sleep(4)
time.sleep(7)
# Get scroll height
"""last_height = driver.execute_script("return document.body.scrollHeight")
this doesn't work due to floating web elements on youtube
"""
last_height = driver.execute_script("return document.documentElement.scrollHeight")
conte = None
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height and conte:
        print("break")
        break
    last_height = new_height
time.sleep(5)
pageSource = driver.page_source
soup = BeautifulSoup(pageSource, 'html.parser')
conte = soup.find_all('div',class_='product-card__body')
images = conte.find('img', {'class':'product-card__hero-image css-1fxh5tw'},src=True)
print(images)
You can try using selenium:
images = driver.find_elements_by_tag_name('img')
for image in images:
    imageClass = image.get_attribute('class')
    if imageClass == 'product-card__hero-image css-1fxh5tw':
        print(image.get_attribute('src'))
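A slightly tighter variant, as a sketch, selects the hero images with a CSS selector and reads their src attributes directly; the class name is copied from the question and may change if the site is updated:

from selenium.webdriver.common.by import By

# Match the product hero images by class and collect their image URLs
hero_images = driver.find_elements(By.CSS_SELECTOR, 'img.product-card__hero-image')
image_urls = [img.get_attribute('src') for img in hero_images]
print(image_urls)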
I want to scrape all the comments of a YouTube video using Selenium, but I am only able to scrape the first 20. I don't see what's wrong with the following code -
imports required
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
initialisation
driver = webdriver.Chrome()
url = 'https://www.youtube.com/watch?v=etzmAZ7oiz0'
driver.get(url)
time.sleep(3)
final_comment_list = []
author_list = []
comment_list = []
while loop for scrolling down the page
last_height = driver.execute_script("return document.body.scrollHeight")
html = driver.find_element(By.TAG_NAME, 'html')
while True:
    print("Scroll down to bottom")
    # Scroll down to bottom
    html.send_keys(Keys.PAGE_DOWN)
    # Wait to load the page
    time.sleep(5)
    # find author name and author comment
    try:
        authors_list_el = driver.find_elements(
            By.CSS_SELECTOR,
            '#author-text.yt-simple-endpoint.style-scope.ytd-comment-renderer span.style-scope.ytd-comment-renderer')
        author_list = [x.text for x in authors_list_el]
    except:
        print(f"not able to find author for {url} video")
    try:
        comments = driver.find_elements(By.CSS_SELECTOR, '#content.style-scope.ytd-expander')
        comment_list = [x.text for x in comments]
    except:
        print(f"not able to find comments for {url} video")
    # creating dictionary object and adding to list
    obj1 = dict(author_list=author_list, comment_list=comment_list)
    final_comment_list.append(obj1)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    else:
        last_height = new_height
printing the result
print(final_comment_list)
print(len(author_list))
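One likely culprit, hinted at by the comment in the Nike snippet below, is that document.body.scrollHeight does not change on YouTube's custom-element layout, so the loop breaks after the first batch of comments. A minimal sketch of the scroll loop tracking document.documentElement.scrollHeight instead; this is an assumption, not a confirmed fix:

# Assumption: on YouTube, document.body.scrollHeight stays constant, so we
# track document.documentElement.scrollHeight while paging down instead.
last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    html.send_keys(Keys.PAGE_DOWN)
    time.sleep(5)  # wait for the next batch of comments to load
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height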
I am working on an application to scrape Hants.gov.uk, and right now I am just clicking through the pages instead of scraping. When it gets to the last row on page 1 it just stops, so what I did was make it click the "Next Page" button, but first it has to go back to the original URL. It clicks page 2, but after page 2 is scraped it doesn't go to page 3; it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result = []
    for link in links:
        if link not in result:
            result.append(link)
        else:
            driver.get(link)
            goUrl = urllib.request.urlopen(link)
            soup = BeautifulSoup(goUrl.read(), "html.parser")
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            for i in range(20):
                pass # Don't worry about all this commented code, it isn't relevant right now
                #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
                #print(table.text)
                # div = soup.select("div.applicationDetails")
                # getDiv = div[i].split(":")[1].get_text()
                # log = open("log.txt", "a")
                # log.write(getDiv + "\n")
            #log.write("\n")

start()
driver.get(url)

for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    url = driver.current_url
    start()
    driver.get(url)
driver.close()
try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
result = []
def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result.extend(links)

def start2():
    for link in result:
        # if link not in result:
        #     result.append(link)
        # else:
        driver.get(link)
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
        #log.write("\n")

while True:
    start()
    element = driver.find_element_by_class_name('rdpPageNext')
    try:
        check = element.get_attribute('onclick')
        if check != "return false;":
            element.click()
        else:
            break
    except:
        break

print(result)
start2()
driver.get(url)
As per the url https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scrapping here on page {}".format(str(i+1)))
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[@class='rdpWrap rdpNumPart']//a[@class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scrapping here on page 1
Perform your scrapping here on page 2
Perform your scrapping here on page 3
Perform your scrapping here on page 4
Perform your scrapping here on page 5
Perform your scrapping here on page 6
Perform your scrapping here on page 7
Perform your scrapping here on page 8
Hi @Feitan Portor, you have written the code almost perfectly. The only reason you are redirected back to the first page is that you set url = driver.current_url in the last for loop: the URL stays static, and it is only the JavaScript click event that loads the next page. So just remove url = driver.current_url and driver.get(url) from that loop
and you are good to go; I have tested it myself.
Also, to get the current page your scraper is on, just add this inside the for loop so you know where your scraper is:
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion
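Applied to the loop from the question, the fix looks roughly like this (a sketch that keeps the original element IDs and simply drops the two lines named above):

for i in range(5):
    # Clicking "Next Page" triggers JavaScript; the URL itself never changes,
    # so there is no need to re-read or re-load it.
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    ss = driver.find_element_by_class_name('rdpCurrentPage').text
    print(ss)  # shows which results page the scraper is currently on
    start()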
**Update**
===================================================
Ok guys, so far so good. I have code that allows me to scrape images, but it stores them in a strange way. It downloads the first 40+ images, then creates another 'kittens' folder within the previously created 'kittens' folder and starts over (downloading the same images as in the first folder). How can I change that? Here is the code:
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup as soup
import requests
import time
import os
image_tags = []
driver = webdriver.Chrome()
driver.get(url='https://www.pexels.com/search/kittens/')
last_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(1)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    else:
        last_height = new_height

sp = soup(driver.page_source, 'html.parser')
for img_tag in sp.find_all('img'):
    image_tags.append(img_tag)

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')

x = 0
for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        with open('kitten-{}.jpg'.format(x), 'wb') as f:
            f.write(requests.get(url).content)
        x += 1
    except:
        pass
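A likely cause of the nested folder (an assumption, since the posted code only creates 'kittens' once per run) is that os.chdir('kittens') leaves the process inside that folder, so running the script again creates kittens/kittens and re-downloads everything. A sketch that avoids chdir by writing to one absolute path instead:

import os
import requests

# Resolve the output directory once and never change the working directory
out_dir = os.path.join(os.getcwd(), 'kittens')
os.makedirs(out_dir, exist_ok=True)

for x, image in enumerate(image_tags):
    try:
        src = image['src']
        with open(os.path.join(out_dir, 'kitten-{}.jpg'.format(x)), 'wb') as f:
            f.write(requests.get(src).content)
    except (KeyError, requests.RequestException):
        pass  # skip tags without a src or downloads that fail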
===========================================================================
I'm trying to write a spider to scrape images of kittens from a page. I've got a small problem, because my spider only gets the first 15 images. I know it's probably because the page loads more images after scrolling down. How can I resolve this issue?
Here is the code:
import requests
from bs4 import BeautifulSoup as bs
import os
url = 'https://www.pexels.com/search/cute%20kittens/'
page = requests.get(url)
soup = bs(page.text, 'html.parser')
image_tags = soup.findAll('img')
if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')

x = 0
for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            with open('kitten-' + str(x) + '.jpg', 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
                x += 1
    except:
        pass
Since the site is dynamic, you need to use a browser manipulation tool such as selenium:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
import os
driver = webdriver.Chrome()
driver.get('https://www.pexels.com/search/cute%20kittens/')
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

image_urls = [i['src'] for i in soup(driver.page_source, 'html.parser').find_all('img')]

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')

with open('kittens.txt', 'w') as f:  # open in write mode so the URL list can be saved
    for url in image_urls:
        f.write('{}\n'.format(url))
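If you want the image files themselves rather than a text file of URLs, you can feed those URLs back into the download loop from your original code, roughly like this:

import requests

# Download each collected URL into the 'kittens' directory created above
for x, url in enumerate(image_urls):
    source = requests.get(url)
    if source.status_code == 200:
        with open('kitten-{}.jpg'.format(x), 'wb') as f:
            f.write(source.content)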
I am attempting to scrape the URLs of the products on an OldNavy webpage. However, it only gives part of the products list instead of the whole thing (for example, only 8 URLs when there are way more than 8). I was hoping someone could help identify what the problem may be.
from bs4 import BeautifulSoup
from selenium import webdriver
import html5lib
import platform
import urllib
import urllib2
import json
link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"
driver = webdriver.PhantomJS()
driver.get(link)
html = driver.page_source
soup = BeautifulSoup(html, "html5lib")
bigDiv = soup.findAll("div", class_="sp_sm spacing_small")
j = 0
for div in bigDiv:
    links = div.findAll("a")
    for i in links:
        j = j + 1
        productUrl = base_url + i["href"]
        print productUrl
This page uses JavaScript to load elements, but only when you scroll down the page.
It is called "lazy loading".
You have to scroll the page too.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"
driver = webdriver.PhantomJS()
driver.get(link)
# ---
# scrolling
lastHeight = driver.execute_script("return document.body.scrollHeight")
#print(lastHeight)
pause = 0.5
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
    #print(lastHeight)
# ---
html = driver.page_source
soup = BeautifulSoup(html, "html5lib")
#driver.find_element_by_class_name
divs = soup.find_all("div", class_="sp_sm spacing_small")
for div in divs:
    links = div.find_all("a")
    for link in links:
        print base_url + link["href"]
Idea: https://stackoverflow.com/a/28928684/1832058