Selenium slow performance on realtime Django project - python

I'm working on a project that fetches posts from a couple of websites and shows them on the main page of my website, with filters that let users search for keywords and see the posts containing those keywords.
this is how the code works :
Here we build the customized URL of the site from our keyword and city filters:
def link_gen(city='', Kword=''):
    """Build the example.com search URL list for a keyword and/or a city.

    Args:
        city: City filter; an empty string means no city filter.
        Kword: Search keyword; an empty string means no keyword filter.

    Returns:
        A one-element list holding the search URL, or ``['none']`` when
        both filters are empty.
    """
    # for example.com
    if Kword != '' and city != '':
        path = f'with{Kword}in{city}'
    elif Kword != '':
        path = f'with{Kword}'
    elif city != '':
        path = f'in{city}'
    else:
        return ['none']

    url = f'https://www.example.com/search/{path}'
    # Same normalisation for every variant: trim the ends of the finished
    # URL, then turn the remaining spaces into hyphens.
    return [url.strip().replace(" ", "-")]
this part is where we crawl for the posts of the target website
# function for getting the title, link, icon, desc of all posts
def get_cards(urls):
    """Scrape up to three job cards from the example.com search page.

    Args:
        urls: List whose first element is the page to fetch (presumably the
            list returned by ``link_gen`` -- confirm against the caller).
            An empty list or the sentinel ``'none'`` means nothing to fetch.

    Returns:
        A list of dicts with the keys ``title``, ``icon``, ``link``,
        ``date`` and ``site``, one per job card found (at most three).
    """
    data = []
    # for example.com
    if not urls or urls[0] == 'none':
        return data

    # we use webdriver to get site with dynamic component and design
    options = Options()
    options.headless = True
    browser = webdriver.Firefox(options=options)
    try:
        browser.get(urls[0])
        print("Headless Firefox Initialized")
        soup = BeautifulSoup(browser.page_source, 'html.parser')
    finally:
        # quit() ends the whole driver session; close() only closes the
        # window, and the old code skipped even that whenever an exception
        # was raised, so browser processes could pile up.
        browser.quit()

    # looping through all the cards
    for job in soup.find_all('div', class_="job-list-item", limit=3):
        # get the title, link, icon, desc
        anchor = job.find('a', class_="title vertical-top display-inline")
        # NOTE(review): tage_name_img is not defined in this snippet;
        # presumably it holds the tag name 'img' -- confirm.
        icon = job.find(tage_name_img)['src']
        link = anchor['href']
        data.append(dict(
            title=anchor.text,
            icon=f'https://www.example.com/{icon}',
            link=f'https://www.example.com/{link}',
            date=job.find('div', class_="date").text,
            site='example',
        ))
    return data
But the problem is that, to get the posts and the dynamic tags on those websites, I need to use Selenium; I can't use session.get(url) because it won't return all the tags.
And with Selenium it takes forever to return the posts, even though I only crawl 3 posts.
I also think webdriver uses a lot of resources.
I ran out of RAM when I tried to run it locally.
Any suggestions would be much appreciated.

Related

Web Crawler Looping the URL to crawl many pages

I am lost with making a loop to go through all of the pages on this book site. The url ends in 'all?page=' followed by the page number, so it should be easy I thought, but I'm stuck. All the info gathering works fine, I just don't know how to move to the next pages. Any help would be appreciated.
import requests
from bs4 import BeautifulSoup
# Everything up to "page=" is constant; the page number is appended per request.
BASE_URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page='
max_pages = 5  # how many result pages to walk through

for page in range(1, max_pages + 1):
    # Build the URL inside the loop so every request targets the next page.
    URL = BASE_URL + str(page)
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
    # results = all books present on page
    # books = each individual book on the page
    results = soup.find(class_='tab search')
    if results is None:
        # Nothing to parse on this page; skip it instead of crashing.
        continue
    books = results.find_all('div', class_='book-item')
    for book in books:
        title = book.h3.a
        author = book.p.span
        # in case there is no rating on a book: count it as 0 stars
        rating_wrap = book.find('div', 'rating-wrap')
        if rating_wrap is None:
            rating = 0
        else:
            rating = len(rating_wrap.find_all('span', 'full-star'))
        publish_date = book.find(class_='published')
        book_format = book.find(class_='format')
        # unneeded text removed such as 'US' before the price shown
        price = book.find('span', class_='sale-price').text.strip().replace('US', '')
        # These two tags may be absent (no discount), so they are looked up
        # afresh for every book and never carried over from the previous one.
        rrp_tag = book.find(class_='rrp')
        saving_tag = book.find(class_='price-save')
        # .text.strip() gets text and rids of empty spaces
        print(title.text.strip())
        print(author.text.strip())
        print(rating, 'stars')
        print(publish_date.text.strip())
        print(book_format.text.strip())
        print(price)
        if rrp_tag is not None:
            print(rrp_tag.text.strip().replace('US', ''))
        if saving_tag is not None:
            print(saving_tag.text.strip().replace('Save US', ''), 'in savings!')
What the code does is loop 5 times in this case, with page going up by one every single time.
max_pages = 5
# Page numbers start at 1, so count 1..max_pages rather than 0..max_pages-1.
for page in range(1, max_pages + 1):
    # The URL is rebuilt on every pass so each request fetches a different page.
    URL = f"https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={page}"
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")

Problem. python scrape with requests + selenium

CODE IS HERE
Hi guys
I have some problem with scraping this dynamic site (https://kvartiry-bolgarii.ru/)
I need to get all the links to the home sale ads
I used selenium to load the page and get links to ads after that I move the page down to load new ads. After the new ads are loaded, I start to parse all the links on the page and write them to the list again.
But the data in the list is not updated and the script continues to work with the links that were on the page before scrolling down.
By the way, I set a check so that the script is executed until the last announcement on the site appears in the list, the link to which I found out in advance
How can this problem be corrected?
def get_link_info():
# Collects ad links from kvartiry-bolgarii.ru into links_urls and sends END
# key presses to the Selenium-driven page between passes.
# NOTE(review): the `try:` below has no `except`/`finally` in this snippet;
# the rest of the function is presumably cut off.
try:
url = "https://kvartiry-bolgarii.ru/"
driver = webdriver.Chrome(
executable_path=r'C:\Users\kk\Desktop\scrape_house\drivers\chromedriver.exe',
options=options
)
driver.get(url)
# The same URL is fetched a second time with requests; the soup below is
# built from that response, not from the page loaded in the browser.
req = requests.get(url)
req.encoding = 'utf8'
soup = BeautifulSoup(req.text, "lxml")
articles = soup.find_all("div", class_="content")
links_urls = []
for article in articles:
house_url = article.find("a").get("href")
links_urls.append(house_url)
#print(links_urls)
# Text after the last "-" of the second-to-last link, minus its first character.
first_link_number = links_urls[-2].split("-")[-1]
first_link_number = first_link_number[1:]
#print(first_link_number)
# Same extraction for the last link.
last_link_number = links_urls[-1].split("-")[-1]
last_link_number = last_link_number[1:]
#print(last_link_number)
# Send the END key to the page's html element in the browser.
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
# Link used as the stop marker for the loop below.
check = "https://kvartiry-bolgarii.ru/kvartira-v-elitnom-komplekse-s-unikalynym-sadom-o21751"
# NOTE(review): `articles` is never re-parsed after the END key press, so the
# inner loop re-appends the same hrefs every time; it also appends to
# `links_urls` while the outer loop is iterating over that same list.
for a in links_urls:
if a != check:
for article in articles:
house_url = article.find("a").get("href")
links_urls.append(house_url)
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
print(links_urls[-1])
else:
print(links_urls[0], links_urls[-1])
print("all links are ready")
Some pointers. You don't need to mix selenium,requests and BeautifulSoup. Just selenium is enough. When you are scrolling infinitely, you need to remove duplicate elements before adding them to your list.
You can try this. This should work.
from selenium import webdriver
import time
def get_link_info():
    """Collect the href of every ad link found while scrolling the site.

    The page is scrolled step by step; after each scroll the links under
    ``div`` elements with class ``content`` are read, and hrefs not seen
    before are recorded. Scrolling stops once a pass yields no new href.

    Returns:
        list: The collected hrefs in discovery order, without duplicates.
    """
    all_links = []
    seen_hrefs = set()
    driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
    try:
        driver.get('https://kvartiry-bolgarii.ru/')
        time.sleep(3)
        while True:
            # Scroll to get more ads
            driver.execute_script("window.scrollBy(0,3825)", "")
            # Wait for new ads to load
            time.sleep(8)
            # XPath attribute tests are written with "@" (not "#").
            link_elements = driver.find_elements_by_xpath('//div[@class="content"]//a')
            found_new = False
            for link in link_elements:
                href = link.get_attribute('href')
                if href in seen_hrefs:
                    # Remove Duplicates
                    continue
                # Scroll to the link.
                driver.execute_script("arguments[0].scrollIntoView();", link)
                seen_hrefs.add(href)
                all_links.append(href)
                found_new = True
            if not found_new:
                # A full scroll-and-wait cycle produced nothing new: stop
                # instead of looping forever.
                break
    finally:
        # Always end the browser session, even when scraping fails.
        driver.quit()
    return all_links


get_link_info()

Web Scraping multiple pages of a web site?

My problem is that I am trying to scrape data from a website with multiple pages, each of which is a separate blog post. But the current code only scrapes the data from the URL that I have set as the variable source.
source = requests.get('https://www.trenerklemen.si/objave/').text.
I would like to scrape text from other url but one part is dynamic and i have no idea how to access it
source = requests.get('https://www.trenerklemen.si/?????/').text
How would i get the ???? part if it changes. Thanks for the answers.
from bs4 import BeautifulSoup
import requests
# Download the blog index page and parse it with lxml.
source = requests.get('https://www.trenerklemen.si/objave/').text
soup = BeautifulSoup(source, 'lxml')

# Only the first article element on the page is inspected.
article = soup.find('article')

headline = article.h2.text
print(headline)

post_body = article.find('div', class_='post-content')
summary = post_body.p.text
print(summary)

player = article.find('iframe', class_='youtube-player')
video = player['src']
# Fifth "/"-separated piece of the embed URL, with any "?..." suffix removed.
video_id = video.split('/')[4].split('?')[0]
yt_link = f'https://youtube.com/watch?v={video_id}'
print(yt_link)
from bs4 import BeautifulSoup
import requests
mainLink = "https://www.trenerklemen.si"
# ALL_links doubles as the work queue: pages found while crawling are
# appended and picked up by later iterations of the while loop.
ALL_links = ["https://www.trenerklemen.si/objave/"]
known_links = set(ALL_links)  # O(1) "already queued?" checks
counter = 0
while counter < len(ALL_links):
    source = requests.get(ALL_links[counter]).text
    soup = BeautifulSoup(source, 'lxml')
    for link in soup.findAll('a'):
        LinkNow = link.get('href')
        if not LinkNow:
            # Anchor without an href (or with an empty one): nothing to follow.
            continue
        if LinkNow[0] == '/':
            # Site-relative link: mainLink already carries scheme and host,
            # so it is prefixed as-is.
            LinkNow = mainLink + LinkNow
        # Compare the normalised URL, so a relative and an absolute link to
        # the same page are queued only once; off-site links are ignored.
        if mainLink in LinkNow and LinkNow not in known_links:
            known_links.add(LinkNow)
            ALL_links.append(LinkNow)
    try:
        article = soup.find('article')
        headline = article.h2.text
        print(headline)
        summary = article.find('div', class_='post-content').p.text
        print(summary)
        video = article.find('iframe', class_='youtube-player')['src']
        video_id = video.split('/')[4]
        video_id = video_id.split('?')[0]
        yt_link = f'https://youtube.com/watch?v={video_id}'
        print(yt_link)
    except Exception as e:
        # Best effort per page: pages lacking the expected article markup are
        # reported and the crawl moves on.
        print("Error: "+str(e))
    counter += 1
Websites usually use sitemaps for search engines to be able to crawl the content. You can use the sitemap as a source of your links to scrape.
Sitemap for your website: https://www.trenerklemen.si/post-sitemap.xml

Unsure why beautifulsoup code won't scrape site

I've used BS a fair bit, but I'm unsure why this won't scrape as the other addons I've made for Kodi work fine. Could someone perhaps look at the code between the tags and perhaps find the bit I'm missing?
The addon/python doesn't throw out any error, it just provides an empty GUI screen. If the title or image scraping were fine and the link wasn't, then it would show a title/image but the link wouldn't work when clicked. So it's obviously the title/image part. I've even tried hashing out the image section so it just looks for a link and title, but still nothing.
Link being scraped: https://store.counterpunch.org/feed/podcast/
def get_soup1(url1):
    """Download *url1* and return it parsed with BeautifulSoup's html.parser.

    Args:
        url1: Address of the page or feed to fetch.

    Returns:
        The BeautifulSoup object built from the response text.
    """
    # Send a browser-like User-Agent; presumably the server does not return
    # the feed content to the default python-requests agent.
    page = requests.get(url1, headers={'User-Agent': 'Mozilla/5.0'})
    soup1 = BeautifulSoup(page.text, 'html.parser')
    print("type: ", type(soup1))
    return soup1


get_soup1("https://store.counterpunch.org/feed/podcast/")
def get_playable_podcast1(soup1):
    """Extract the playable episodes from a parsed podcast feed.

    Args:
        soup1: Parsed feed exposing ``find_all('item', limit=...)``; each
            item is searched for an ``enclosure`` and a ``title`` element.

    Returns:
        A list of dicts with the keys ``url``, ``title`` and ``thumbnail``,
        built from at most the first nine items. Items missing either
        element are skipped.
    """
    # Every episode uses the same artwork.
    thumbnail = "https://is2-ssl.mzstatic.com/image/thumb/Podcasts71/v4/71/55/88/71558834-c449-9ac3-e327-cad002e305b4/mza_4409042347411679857.jpg/600x600bb.jpg"
    subjects = []
    for content in soup1.find_all('item', limit=9):
        try:
            link = content.find('enclosure').get('url')
            print("\n\nLink: ", link)
            title = content.find('title').get_text()
        except AttributeError:
            # find() returned None: the item has no enclosure or no title.
            continue
        subjects.append({
            'url': link,
            'title': title,
            'thumbnail': thumbnail,
        })
    return subjects
def compile_playable_podcast1(playable_podcast1):
    """Convert scraped podcast dicts into playable list-item dicts.

    Args:
        playable_podcast1: Iterable of dicts with the keys ``title``,
            ``thumbnail`` and ``url``.

    Returns:
        A list with one dict per input entry, holding ``label``,
        ``thumbnail``, ``path`` and ``is_playable`` (always ``True``).
    """
    return [
        {
            'label': podcast['title'],
            'thumbnail': podcast['thumbnail'],
            'path': podcast['url'],
            'is_playable': True,
        }
        for podcast in playable_podcast1
    ]
You need a User-Agent
def get_soup1(url1):
    """Fetch *url1* with a browser-like User-Agent and parse it with html.parser.

    Prints the type of the parsed object and returns that object.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url1, headers=request_headers)
    parsed = BeautifulSoup(response.text, 'html.parser')
    print("type: ", type(parsed))
    return parsed

URL changes while using proxy and Selenium

I am new to web scraping so please forgive my ignorance.
I built a program to scrape Zillow, and everything has worked fine for the most part. My problem is I am using a proxy service called proxycrawl that easily allows me to integrate proxies into my program. This is done by placing https://api.proxycrawl.com/?token=xxx&url= before my actual URL. What I have noticed is that when the program clicks on an "a" tag, the URL changes to the example below:
Before:
Before Click
After:
After Click
Any 11 clicks through the program or manually result in the site changing to the proxycrawl site, where I get the 404 error. Any ideas?
#Browser open
print(".....Opening Browser.....")
Browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
Browser.maximize_window()
#browser page
# The target URL is percent-encoded exactly once before being handed to the proxy.
url = urllib.parse.quote_plus('https://www.zillow.com/homes/for_sale/Bakersfield-CA-93312/house,mobile,land,townhouse_type/97227_rid/35.4606,-119.037467,35.317856,-119.200888_rect/12_zm/0_mmm/')
Browser.get('https://api.proxycrawl.com/?token=xxx&url=' + url)
print("Opening Zillow")
time.sleep(10)
# XPath attribute tests are written with "@" (not "#"). The text of the
# second-to-last pagination entry is read as the number of pages.
last_page = int(Browser.find_element_by_xpath('//ol[@class="zsg-pagination"]//li[last()-1]').text)
#print last_page
page = 0
count = 0
# newline='' lets the csv module control line endings itself (no blank rows
# on Windows). The file stays open because the code that follows keeps
# writing rows through `writer`.
csv_file = open('listings.csv', 'w', newline='')
fieldnames = [
    'address', 'price', 'zestimate', 'beds', 'baths', 'feet', 'desc', 'Type',
    'year_built', 'heating', 'cooling', 'parking', 'lot', 'days_on_market',
    'pricepsqr', 'saves', 'interior', 'spaces_amenities', 'construction',
    'exterior', 'parking1', 'mls', 'other',
]
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
# Walk every results page and click into each listing on it.
# NOTE(review): the inner loop below reuses `i`, shadowing this loop's variable.
for i in range(last_page):
page = page + 1
n = 0
# NOTE(review): XPath attribute tests are written with "@" (`@id`); `#id` as
# written in the expressions below is not valid XPath.
listings = Browser.find_elements_by_xpath("""//*[#id="search-results"]/ul/li""")
for i in range(len(listings)):
n = i + 1
listing_dict = {}
print("Scraping the listing number {0} on page {1}, the count is {2}".format(n, page, count))
# NOTE(review): `count` is never incremented in the visible code, so this
# condition is true on every pass.
if (count) % 11 == 0:
listings = Browser.find_elements_by_xpath('//*[#id="search-results"]/ul/li')
time.sleep(2)
# NOTE(review): this `try:` has no `except`/`finally` in the snippet; the
# remainder of the loop body is presumably cut off.
try:
# Finds Listings
listings = Browser.find_elements_by_xpath("""//*[#id="search-results"]/ul/li""")
print("Looking Up listings")
# Opens Listing
listings[i].find_elements_by_tag_name('a')[0].click()
print("Opening Listing")
time.sleep(2)
# Opens "See More Tab"
Browser.find_element_by_partial_link_text('See More').click()
# Prepare for Scrape
time.sleep(2)
I did speak with proxycrawl, and they stated that the URL had to be encoded, which I did do with no luck. After encoding, I replied and got the following statement:
"You are sending your requests double encoded and your get a response of pc_status: 602. Those requests are failing and you should fix them. Please only encode the URLs once, encoding the URLs more than once will result in a failing request."
It looks like the page is trying to redirect you relatively.
In this specific use case, you could hack your way around the encoding issue by doing something similar to the following
# After the relative redirect the browser sits on the proxy host, e.g.
# https://api.proxycrawl.com/homes/for_sale/Test/one,two
landed_url = driver.current_url
# Skip the first 26 characters (the length of "https://api.proxycrawl.com"),
# leaving only the path: /homes/for_sale/Test/one,two
path_only = landed_url[26:]
# base_url is expected to be https://api.proxycrawl.com/?token=xxx&url=
driver.get(base_url + path_only)

Categories