beginner web scraping code iteration issue

beginner web scraping code iteration issue - python

I am new to Python and would really appreciate some help!!
I have been trying to create a dictionary for assigning books to their authors, only for it to come out messy and be repeating itself.
How can I fix this?
import requests
from bs4 import BeautifulSoup
url = "https://www.banyen.com/new-arrivals/index.html"
response = requests.get(url)
html = response.content
scraped = BeautifulSoup(html,'html.parser')
results = []
article = scraped.find("div", class_="block block-system block-odd clearfix")
for i in article.find_all():
name = i.find("h2", "a href", class_="teaser-title")
author = i.find("span", class_="price-amount")
if name is not None:
if author is not None:
results.append({name:author})
print(results)

import requests
from bs4 import BeautifulSoup
import re
url = "https://www.banyen.com/new-arrivals/index.html"
response = requests.get(url)
html = response.content
scraped = BeautifulSoup(html,'html.parser')
results = []
articles = scraped.find_all("div", id=re.compile("node-"))
for i in articles:
name = i.find("h2").find('a')
author = i.find("span", class_="price-amount")
if name is not None:
if author is not None:
results.append({name.text.strip():author.text})
print(results)

Related

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- a code that (somewhat) succesfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
links = []
website = requests.get(url)
website_text = website.text
soup = BeautifulSoup(website_text)
for link in soup.find_all('a'):
links.append(link.get('href'))
for link in links:
print(link)
print(len(links))
get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
profiles = []
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
for profile in container.find_all('a'):
profiles.append(profile.get('p'))
for profile in profiles:
print(profile)
get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!

You need to combine your two scripts together and make requests for each player. Try the following approach. This searches for <td> tags that have the data-td=Player attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
data = []
req_url = requests.get(url)
soup = BeautifulSoup(req_url.content, "html.parser")
for td in soup.find_all('td', {'data-th' : 'Player'}):
a_tag = td.a
name = a_tag.text
player_url = a_tag['href']
print(f"Getting {name}")
req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
soup_player = BeautifulSoup(req_player_url.content, "html.parser")
div_profile_box = soup_player.find("div", class_="profile-box")
row = {"Name" : name, "URL" : player_url}
for p in div_profile_box.find_all("p"):
try:
key, value = p.get_text(strip=True).split(':', 1)
row[key.strip()] = value.strip()
except: # not all entries have values
pass
data.append(row)
return data
urls = [
'https://basketball.realgm.com/dleague/players/2022',
'https://basketball.realgm.com/dleague/players/2021',
'https://basketball.realgm.com/dleague/players/2020',
]
for url in urls:
print(f"Getting: {url}")
data = get_links(url)
for entry in data:
print(entry)

how to scrape author name and author url from a webpage using python

i am trying to scrape author name and author url from the following webpage.
https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20?source=tag_archive
and i am using following code;
author_flag = 0
divs = soup.find_all('h2')
for div in divs:
author = div.find('a')
if(author is not None):
author_art.append(author.text)
author_url.append('https://medium.com'+ author.get('href'))
aurhor_flag = 1
break
if(author_flag==0):
author_art.append('Author information missing')
author_url.append('Author Url information missing')
can anyone take a look what i am doing wrong in this? As this code is not picking anything.
its is just returning blank list.
Full code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
data = pd.read_csv('url_technology.csv')
author_art = []
author_url = []
for i in range(1):
try:
author_flag = 0
divs = soup.find_all('meta')
for div in divs:
author = div.find('span')
if(author is not None):
author_art.append(author.text)
author_url.append('https://medium.com'+author.get('href'))
aurhor_flag = 1
break
if(author_flag==0):
author_art.append('Author information missing')
author_url.append('Author Url information missing')
except:
print('no data found')
author_art = pd.DataFrame(title)
author_url = pd.DataFrame(url)
res = pd.concat([author_art, author_art] , axis=1)
res.columns = ['Author_Art', 'Author_url']
res.to_csv('combined1.csv')
print('File created successfully')
https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20?source=tag_archive---------0-----------------------
https://medium.com/job-advice-for-software-engineers/what-i-want-and-dont-want-to-see-on-your-software-engineering-resume-cbc07913f7f6?source=tag_archive---------1-----------------------
https://itnext.io/load-testing-using-apache-jmeter-af189dd6f805?source=tag_archive---------2-----------------------
https://medium.com/s/story/black-mirror-bandersnatch-a-study-guide-c46dfe9156d?source=tag_archive---------3-----------------------
https://medium.com/fast-company/the-worst-design-crimes-of-2018-56f32b027bb7?source=tag_archive---------4-----------------------
https://towardsdatascience.com/make-your-pictures-beautiful-with-a-touch-of-machine-learning-magic-31672daa3032?source=tag_archive---------5-----------------------
https://medium.com/hackernoon/the-state-of-ruby-2019-is-it-dying-509160a4fb92?source=tag_archive---------6-----------------------

One possibility how to get author Name and author URL is to parse the Ld+Json data embedded within the page:
import json
import requests
from bs4 import BeautifulSoup
url = "https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
data = json.loads(soup.select_one('[type="application/ld+json"]').contents[0])
# uncomment this to print all LD+JSON data:
# print(json.dumps(data, indent=4))
print("Author:", data["author"]["name"])
print("URL:", data["author"]["url"])
Prints:
Author: Eric Elliott
URL: https://medium.com/#_ericelliott
EDIT: A function that returns Author Name/URL:
import json
import requests
from bs4 import BeautifulSoup
def get_author_name_url(medium_url):
soup = BeautifulSoup(requests.get(url).content, "html.parser")
data = json.loads(
soup.select_one('[type="application/ld+json"]').contents[0]
)
return data["author"]["name"], data["author"]["url"]
url_list = [
"https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20",
]
for url in url_list:
name, url = get_author_name_url(url)
print("Author:", name)
print("URL:", url)

I've launched a python package called medium-apis to do such tasks.
Install medium-apis
pip install medium-apis
Get you RapidAPI key. See how
Run the code:
from medium_apis import Medium
medium = Medium('YOUR_RAPIDAPI_KEY')
def get_author(url):
url_without_parameters = url.split('?')[0]
article_id = url_without_parameters.split('-')[-1]
article = medium.article(article_id=article_id)
author = article.author
author.save_info()
return author
urls = [
"https://nishu-jain.medium.com/medium-apis-documentation-3384e2d08667",
]
for url in urls:
author = get_author(url)
print('Author: ', author.fullname)
print('Profile URL: ', f'https://medium.com/#{author.username}')
Github repo: https://github.com/weeping-angel/medium-apis

Struggling to scrape a telegram public channel with BeautifulSoup

I'm practicing web scraping with BeautifulSoup but I struggle to finish printing a dictionary including the items I've scraped
The web targeted can be any telegram public channel (web version) and I pretend to collect and add as part of the dictionary the text message, timestamp, views and image url (if exist attached to the post).
I've inspected the code for the 4 elements but the one related to the image url has no class or span, so I've ended scraping them it via regex. The other 3 elements are easily retrievable.
Let's go by parts:
Importing modules
from bs4 import BeautifulSoup
import requests
import re
Function to get the images url from the public channel
def pictures(url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
link = str(soup.find_all('a', class_ = 'tgme_widget_message_photo_wrap')) #converted to str in order to be able to apply regex
image_url = re.findall(r"https://cdn4.*.*.jpg", link)
return image_url
Soup to get the text message, timestamp and views
picture_list = pictures(url)
url = "https://t.me/s/computer_science_and_programming"
channel = requests.get(url).text
soup = BeautifulSoup(channel, 'lxml')
tgpost = soup.find_all('div', class_ ='tgme_widget_message')
full_message = {}
for content in tgpost:
full_message['views'] = content.find('span', class_ = 'tgme_widget_message_views').text
full_message['timestamp'] = content.find('time', class_ = 'time').text
full_message['text'] = content.find('div', class_ = 'tgme_widget_message_text').text
print(full_message)
I would really appreciate if someone can help me, I'm new to Python and I don't know how I could do it to
Check if the post contains an image and if so, add it to the dictionary
Print the dictionary including image_url as key and the url as value for each post.
Thank you very much

I think you want something like this.
from bs4 import BeautifulSoup
import requests, re
url = "https://t.me/s/computer_science_and_programming"
channel = requests.get(url).text
soup = BeautifulSoup(channel, 'lxml')
tgpost = soup.find_all('div', class_ ='tgme_widget_message')
full_message = {}
for content in tgpost:
full_message['views'] = content.find('span', class_ = 'tgme_widget_message_views').text
full_message['timestamp'] = content.find('time', class_ = 'time').text
full_message['text'] = content.find('div', class_ = 'tgme_widget_message_text').text
if content.find('a', class_ = 'tgme_widget_message_photo_wrap') != None :
link = str(content.find('a', class_ = 'tgme_widget_message_photo_wrap'))
full_message['url_image'] = re.findall(r"https://cdn4.*.*.jpg", link)[0]
elif 'url_image' in full_message:
full_message.pop('url_image')
print(full_message)

Scraping AJAX loaded content with python?

So i have function that is called when i click a button , it goes as below
var min_news_id = "68feb985-1d08-4f5d-8855-cb35ae6c3e93-1";
function loadMoreNews(){
$("#load-more-btn").hide();
$("#load-more-gif").show();
$.post("/en/ajax/more_news",{'category':'','news_offset':min_news_id},function(data){
data = JSON.parse(data);
min_news_id = data.min_news_id||min_news_id;
$(".card-stack").append(data.html);
})
.fail(function(){alert("Error : unable to load more news");})
.always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
Now i don't have much experience with javascript , but i assume its returning some json data from some sort of api at "en/ajax/more_news" .
Is there i way could directly call this api and get the json data from my python script. If Yes,how?
If not how do i scrape the content that is being generated?

You need to post the news id that you see inside the script to https://www.inshorts.com/en/ajax/more_news, this is an example using requests:
from bs4 import BeautifulSoup
import requests
import re
# pattern to extract min_news_id
patt = re.compile('var min_news_id\s+=\s+"(.*?)"')
with requests.Session() as s:
soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content)
new_id_scr = soup.find("script", text=re.compile("var\s+min_news_id"))
print(new_id_scr.text)
news_id = patt.search(new_id_scr.text).group()
js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset":news_id})
print(js.json())
js gives you all the html, you just have to access the js["html"].

Here is the script that will automatically loop through all the pages in inshort.com
from bs4 import BeautifulSoup
from newspaper import Article
import requests
import sys
import re
import json
patt = re.compile('var min_news_id\s+=\s+"(.*?)"')
i = 0
while(1):
with requests.Session() as s:
if(i==0):soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content,"lxml")
new_id_scr = soup.find("script", text=re.compile("var\s+min_news_id"))
news_id = patt.search(new_id_scr.text).group(1)
js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset":news_id})
jsn = json.dumps(js.json())
jsonToPython = json.loads(jsn)
news_id = jsonToPython["min_news_id"]
data = jsonToPython["html"]
i += 1
soup = BeautifulSoup(data, "lxml")
for tag in soup.find_all("div", {"class":"news-card"}):
main_text = tag.find("div", {"itemprop":"articleBody"})
summ_text = main_text.text
summ_text = summ_text.replace("\n", " ")
result = tag.find("a", {"class":"source"})
art_url = result.get('href')
if 'www.youtube.com' in art_url:
print("Nothing")
else:
art_url = art_url[:-1]
#print("Hello", art_url)
article = Article(art_url)
article.download()
if article.is_downloaded:
article.parse()
article_text = article.text
article_text = article_text.replace("\n", " ")
print(article_text+"\n")
print(summ_text+"\n")
It gives both the summary from inshort.com and complete news from respective news channel.

Web crawler - following links

Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am so stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup
# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
my_list = []
all_links = soup.find_all("a")
for link in all_links:
link2 = link["href"]
my_list.append(link2)
for i in my_list[1:93]:
print i
# The output shows all the links that I would like to follow and gather information from. How do I do that?

Here is my solution using lxml. It's similar to BeautifulSoup
import lxml
from lxml import html
import requests
page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[#class="LetterGroup"]//a/#href') # grab all link
print 'Length of all links = ', len(my_list)
my_list is a list consist of all links. And now you can use for loop to scrape information inside each page.
We can for loop through each links. Inside each page, you can extract information as example. This is only for the top table.
table_information = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
table_key = tree.xpath('//td[#class="statusHeader"]/text()')
table_value = tree.xpath('//td[#class="statusText"]/text()') + tree.xpath('//td[#class="statusText"]/a/text()')
table_information.append(zip([t]*len(table_key), table_key, table_value))
For table below the page,
table_information_below = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
l1 = tree.xpath('//tr[#class="tableRowPrimary"]/td[#class="StemmerNu"]/text()')
l2 = tree.xpath('//tr[#class="tableRowSecondary"]/td[#class="StemmerNu"]/text()')
table_information_below.append([t]+l1+l2)
Hope this help!

A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
soup = BeautifulSoup(urllib2.urlopen(url).read())
# then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession
def my_parse_function(html):
"""Use this function to parse each page"""
soup = BeautifulSoup(html)
all_paragraphs = soup.find_all('p')
return all_paragraphs
session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
page_results = [my_parse_function(future.result()) for future in results]

This would be my solution for your problem
import requests
from bs4 import BeautifulSoup
def spider():
url = "http://www.kmdvalg.dk/main"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('div', {'class': 'LetterGroup'}):
anc = link.find('a')
href = anc.get('href')
print(anc.getText())
print(href)
# spider2(href) call a second function from here that is similar to this one(making url = to herf)
spider2(href)
print("\n")
def spider2(linktofollow):
url = linktofollow
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
anc = link.find('td')
print(anc.getText())
print("\n")
spider()
its not done... i only get a simple element from the table but you get the idea and how its supposed to work.

Here is my final code that works smooth. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs
f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
liste = []
alle_links = soup.find_all("a")
for link in alle_links:
link2 = link["href"]
liste.append(link2)
for url in liste[1:93]:
soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
tds = soup.findAll('td')
stemmernu = soup.findAll('td', class_='StemmerNu')
print >> f, tds[5].string,";",tds[12].string,";",tds[14].string,";",tds[16].string,";", stemmernu[0].string,";",stemmernu[1].string,";",stemmernu[2].string,";",stemmernu[3].string,";",stemmernu[6].string,";",stemmernu[8].string,";",'\r\n'
f.close()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

beginner web scraping code iteration issue - python

Related

Pulling p tags from multiple URLs

how to scrape author name and author url from a webpage using python

Struggling to scrape a telegram public channel with BeautifulSoup

Scraping AJAX loaded content with python?

Web crawler - following links

Categories

Resources