Unsure why BeautifulSoup code won't scrape site - Python

I've used BeautifulSoup a fair bit, but I'm unsure why this won't scrape, as the other addons I've made for Kodi work fine. Could someone look at the code below and spot the bit I'm missing?
The addon/Python doesn't throw any error; it just produces an empty GUI screen. If the title or image scraping were fine and only the link wasn't, it would still show a title/image, the link just wouldn't work when clicked. So it's obviously the title/image part that's failing. I've even tried commenting out the image section so it only looks for a link and title, but still nothing.
Link being scraped: https://store.counterpunch.org/feed/podcast/
import requests
from bs4 import BeautifulSoup

def get_soup1(url1):
    page = requests.get(url1)
    soup1 = BeautifulSoup(page.text, 'html.parser')
    print("type: ", type(soup1))
    return soup1

get_soup1("https://store.counterpunch.org/feed/podcast/")

def get_playable_podcast1(soup1):
    subjects = []
    for content in soup1.find_all('item', limit=9):
        try:
            link = content.find('enclosure')
            link = link.get('url')
            print("\n\nLink: ", link)
            title = content.find('title')
            title = title.get_text()
        except AttributeError:
            continue
        item = {
            'url': link,
            'title': title,
            'thumbnail': "https://is2-ssl.mzstatic.com/image/thumb/Podcasts71/v4/71/55/88/71558834-c449-9ac3-e327-cad002e305b4/mza_4409042347411679857.jpg/600x600bb.jpg",
        }
        subjects.append(item)
    return subjects

def compile_playable_podcast1(playable_podcast1):
    items = []
    for podcast in playable_podcast1:
        items.append({
            'label': podcast['title'],
            'thumbnail': podcast['thumbnail'],
            'path': podcast['url'],
            'is_playable': True,
        })
    return items

You need a User-Agent
def get_soup1(url1):
    page = requests.get(url1, headers={'User-Agent': 'Mozilla/5.0'})
    soup1 = BeautifulSoup(page.text, 'html.parser')
    print("type: ", type(soup1))
    return soup1
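As a quick sanity check (this is only a sketch; the exact response the feed sends back without a User-Agent may differ), you can compare the status codes with and without the header before parsing:

import requests

url = "https://store.counterpunch.org/feed/podcast/"
# Without a browser-like User-Agent the request may be rejected (e.g. a 403 or an error page)
print(requests.get(url).status_code)
# With the header the feed should come back normally (status 200)
print(requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).status_code)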

Selenium slow performance on a realtime Django project

I'm working on a project in which I get posts from a couple of websites and show them on the main page of my own website, with filters, and let users search for keywords and see the posts containing those keywords.
This is how the code works.
Here we generate the customized URL of the site from our keyword and city filter:
def link_gen(city='', Kword=''):
    # for example.com
    urls = []
    if Kword != '':
        if city == '':
            url = f'https://www.example.com/search/with{Kword}'
            url = url.strip()
            url = url.replace(" ", "-")
            urls.append(url)
        else:
            url = f'https://www.example.com/search/with{Kword}in{city}'
            url = url.strip()
            url = url.replace(" ", "-")
            urls.append(url)
    else:
        if city != '':
            url = f'https://www.example.com/search/in{city}'
            url = url.strip()
            url = url.replace(" ", "-")
            urls.append(url)
        else:
            urls.append('none')
    return urls
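For illustration only (the keyword and city values below are made up), calling link_gen as written above produces URLs shaped like this:

# Hypothetical inputs, just to show the URL shapes the function builds
print(link_gen(Kword='python developer'))
# ['https://www.example.com/search/withpython-developer']
print(link_gen(city='Berlin', Kword='python developer'))
# ['https://www.example.com/search/withpython-developerinBerlin']
print(link_gen())
# ['none']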
This part is where we crawl the posts from the target website:
# function for getting the title, link, icon, desc of all posts
def get_cards(urls):
    data = []
    # for example.com
    if urls[0] != 'none':
        # we use webdriver to get the site with its dynamic components and design
        url = urls[0]
        options = Options()
        options.headless = True
        browser = webdriver.Firefox(options=options)
        browser.get(url)
        print("Headless Firefox Initialized")
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        jobs = soup.find_all('div', class_="job-list-item", limit=3)
        # looping through all the cards
        for job in jobs:
            # get the title, link, icon, desc
            title = job.find('a', class_="title vertical-top display-inline").text
            icon = job.find(tage_name_img)['src']
            link = job.find('a', class_="title vertical-top display-inline")['href']
            date = job.find('div', class_="date").text
            data.append(dict(
                title=title,
                icon=f'https://www.example.com/{icon}',
                link=f'https://www.example.com/{link}',
                date=date,
                site='example'
            ))
        browser.close()
    return data
The problem is that, to get the posts and the dynamic tags on these websites, I needed to use Selenium; I can't use session.get(url) because it won't return all the tags. With Selenium it takes forever to return the posts even though I only crawl 3 posts, and I think the webdriver uses a lot of resources. I ran out of RAM when I tried to run it locally.
Any suggestion would be much appreciated.
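One option worth trying, sketched below under the assumption that the markup and class names match the get_cards code above: start a single headless browser, reuse it for every URL, and quit it at the end, instead of launching a fresh Firefox process per call. Launching the browser is usually the most expensive step, so reusing it tends to cut both time and memory.

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

def get_cards_with_shared_browser(url_lists):
    # url_lists is assumed to be a list of url lists as returned by link_gen()
    options = Options()
    options.headless = True
    browser = webdriver.Firefox(options=options)  # started once, not per request
    data = []
    try:
        for urls in url_lists:
            if urls[0] == 'none':
                continue
            browser.get(urls[0])
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            for job in soup.find_all('div', class_="job-list-item", limit=3):
                data.append({
                    'title': job.find('a', class_="title vertical-top display-inline").text,
                    'date': job.find('div', class_="date").text,
                    'site': 'example',
                })
    finally:
        browser.quit()  # quit() (not close()) ends the whole browser process and frees its memory
    return data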

Python: scrape Google search results

I am trying to scrape all the data from the Google search results: title, URL and description.
However, I can't grab the description of the search results; it returns an empty string.
# check Chrome version: Menu (the three dots, upper right corner) -> Help -> About Google Chrome
# download ChromeDriver according to the Chrome version (example version 79)
# download from https://sites.google.com/a/chromium.org/chromedriver/downloads
# place the chromedriver.exe file in the current working directory
# pip install selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from bs4.element import Tag
import pandas as pd
import random

keywords = pd.read_csv('keywords.csv', header=0, index_col=None)
df = pd.DataFrame(columns=['keyword', 'title', 'url', 'description'])
for i in keywords['keyword']:
    # Scraper that gives back: titles, links, descriptions
    driver = webdriver.Chrome()
    google_url = "https://www.google.com/search?gl=US&q=" + i + "&num=" + str(10)
    driver.get(google_url)
    time.sleep(random.randrange(15, 50))
    soup = BeautifulSoup(driver.page_source, 'lxml')
    result_div = soup.find_all('div', attrs={'class': 'g'})
    links = []
    titles = []
    descriptions = []
    for r in result_div:
        # Check if each element is present; otherwise an exception is raised
        try:
            link = r.find('a', href=True)
            title = None
            title = r.find('h3')
            if isinstance(title, Tag):
                title = title.get_text()
            description = None
            description = r.find('span', attrs={'class': 'st'})
            if isinstance(description, Tag):
                description = description.get_text()
            # Check to make sure everything is present before appending
            if link != '' and title != '' and description != '':
                links.append(link['href'])
                titles.append(title)
                descriptions.append(description)
        # Next loop if one element is not present
        except Exception as e:
            print(e)
            continue
    for link, title, description in zip(links, titles, descriptions):
        df = df.append({'keyword': i, 'title': title, 'url': link, 'description': description}, ignore_index=True)
df.to_csv(r'final_dataset.csv', index=False)
Does anyone have an idea how to grab the description in the Google search results?
Get the description node with the following code:
description = r.select('.aCOpRe span:not(.f)')
Also, you can use requests instead of Selenium. The full example is in the online IDE.
from requests import Session
from bs4 import BeautifulSoup
from bs4.element import Tag
import pandas as pd

keywords = pd.read_csv('keywords.csv', header=0, index_col=None)
df = pd.DataFrame(columns=['keyword', 'title', 'url', 'description'])
for i in keywords['keyword']:
    # Scraper that gives back: titles, links, descriptions
    params = {"q": i, 'gl': 'US', 'num': 10}
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 Edg/80.0.361.62"
    }
    with Session() as session:
        r = session.get(
            "https://google.com/search", params=params, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    result_div = soup.find_all('div', attrs={'class': 'g'})
    links = []
    titles = []
    descriptions = []
    for r in result_div:
        # Check if each element is present; otherwise an exception is raised
        try:
            link = r.find('a', href=True)
            title = r.find('h3')
            if isinstance(title, Tag):
                title = title.get_text()
            description = r.select('.aCOpRe span:not(.f)')
            if isinstance(description, Tag):
                description = description.get_text()
            # Check to make sure everything is present before appending
            if link != '' and title != '' and description != '':
                links.append(link['href'])
                titles.append(title)
                descriptions.append(description)
        # Next loop if one element is not present
        except Exception as e:
            print(e)
            continue
    for link, title, description in zip(links, titles, descriptions):
        df = df.append({
            'keyword': i,
            'title': title,
            'url': link,
            'description': description
        }, ignore_index=True)
df.to_csv(r'final_dataset.csv', index=False)
Alternatively, you can extract data from Google Search via SerpApi.
Disclaimer: I work at SerpApi.

BeautifulSoup generating inconsistent results

I'm using BeautifulSoup to pull data out of Reddit sidebars on a selection of subreddits, but my results change pretty much every time I run my script.
Specifically, the results in sidebar_urls change from iteration to iteration; sometimes it returns [XYZ.com/abc, XYZ.com/def], other times just [XYZ.com/def], and sometimes an empty [].
Any ideas why this might be happening with the code below?
sidebar_urls = []
for i in range(0, len(reddit_urls)):
    req = urllib.request.Request(reddit_urls[i], headers=headers)
    resp = urllib.request.urlopen(req)
    soup = BeautifulSoup(resp, 'html.parser')
    links = soup.find_all(href=True)
    for link in links:
        if "XYZ.com" in str(link['href']):
            sidebar_urls.append(link['href'])
It seems you sometimes get a page that does not have a sidebar. It could be because Reddit recognizes you as a robot and returns a default page instead of the one you expect. Consider identifying yourself when requesting the pages, using the User-Agent header:
import requests
from bs4 import BeautifulSoup

reddit_urls = [
    "https://www.reddit.com/r/leagueoflegends/",
    "https://www.reddit.com/r/pokemon/"
]

# Update this to identify yourself
user_agent = "me#example.com"

sidebar_urls = []
for reddit_url in reddit_urls:
    response = requests.get(reddit_url, headers={"User-Agent": user_agent})
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the sidebar tag
    side_tag = soup.find("div", {"class": "side"})
    if side_tag is None:
        print("Could not find a sidebar in page: {}".format(reddit_url))
        continue

    # Find all links in the sidebar tag
    link_tags = side_tag.find_all("a")
    for link in link_tags:
        link_text = str(link["href"])
        sidebar_urls.append(link_text)

print(sidebar_urls)

How to make my crawler parse data from the start page

I've written some code in Python to grab details from a torrent site, and when I run it I get the results I expected. The only problem with this crawler is that it skips the content of the first page [as the pagination URLs start from 2], which I can't fix. Any help on this will be highly appreciated.
import requests
from lxml import html

page_link = "https://yts.ag/browse-movies"
b_link = "https://yts.ag"

def get_links(main_link):
    response = requests.get(main_link).text
    tree = html.fromstring(response)
    for item in tree.cssselect('ul.tsc_pagination a'):
        if "page" in item.attrib["href"]:
            movie_details(b_link + item.attrib["href"])

def movie_details(link):
    response = requests.get(link).text
    tree = html.fromstring(response)
    for titles in tree.cssselect("div.browse-movie-wrap"):
        title = titles.cssselect('div.browse-movie-bottom a.browse-movie-title')[0].text
        link = titles.cssselect('div.browse-movie-year')[0].text
        rating = titles.cssselect('figcaption.hidden-xs h4.rating')[0].text
        genre = titles.cssselect('figcaption.hidden-xs h4')[0].text
        genre1 = titles.cssselect('figcaption.hidden-xs h4')[1].text
        print(title, link, rating, genre, genre1)

get_links(page_link)
Why not just call the movie_details() function on the main_link before the loop?
def get_links(main_link):
    response = requests.get(main_link).text
    tree = html.fromstring(response)
    movie_details(main_link)
    for item in tree.cssselect('ul.tsc_pagination a'):
        if "page" in item.attrib["href"]:
            movie_details(b_link + item.attrib["href"])

Scrape through website and iterate over search results to get specific data

I'm trying to work on a project to scrape www.boattrader.com to push 800 listings with the Make, Price, and Phone Number of each boat to a CSV file.
I'm looking for guidance on the best way to scrape the links to each boat listing from the search results and then parse through each individual page to grab the Make, Price and Phone number.
Any guidance would be much appreciated!
Thanks again!
from bs4 import BeautifulSoup, SoupStrainer
import requests

def extract_from_search(search_results):
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    for link in possible_links:
        if link.has_attr('href'):
            boat_links = link.attrs['href']
    return boat_links

search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)
print boat_links  # why does this only print one link? What would be the best way to iterate over the search results, so I can put those links into the boat_listing variable to grab the information I'm looking for?

def extract_from_listing(boat_listing):
    r = requests.get(boat_listing)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    table_heads = soup.find_all('th')
    for th in table_heads:
        if th.text == "Make":
            make = th.find_next_sibling("td").text
    price = soup.find('span', {'class': 'bd-price'})
    formatted_price = price.string.strip()
    contact_info = soup.find('div', {'class': 'phone'})
    reversed_phone = contact_info.string[::-1]
    temp_phone = reversed_phone.replace(')', '}')
    temp_phone2 = temp_phone.replace('(', ')')
    correct_phone = temp_phone2.replace("}", "(")
    return make, formatted_price, correct_phone

boat_listing = 'http://www.boattrader.com/listing/2009-Briggs-BR9134-Sportfish-102290211'
make, price, phone = extract_from_listing(boat_listing)
print make
print price
print phone
You are only returning the last link, you need to append:
def extract_from_search(search_results):
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    boat_links = []  # create list to append all links to
    for link in possible_links:
        if link.has_attr('href'):
            boat_links.append(link.attrs['href'])  # append each link
    return boat_links
Or use a list comp:
def extract_from_search(search_results):
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.content  # use content to let requests handle the decoding
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    return [link.attrs['href'] for link in possible_links if link.has_attr('href')]
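To iterate over the search results, the list returned by either version can then be fed to the extract_from_listing function from the question, for example (a sketch in the question's Python 2 style; it assumes every listing page contains the Make, price and phone elements that function looks for):

# Sketch: loop over every link returned by extract_from_search
# and pull the details with the question's extract_from_listing function
all_boats = []
for boat_link in extract_from_search(search_results):
    make, price, phone = extract_from_listing(boat_link)
    all_boats.append((make, price, phone))
print all_boats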
