Struggling to scrape a telegram public channel with BeautifulSoup - python

I'm practicing web scraping with BeautifulSoup but I struggle to finish printing a dictionary including the items I've scraped
The web targeted can be any telegram public channel (web version) and I pretend to collect and add as part of the dictionary the text message, timestamp, views and image url (if exist attached to the post).
I've inspected the code for the 4 elements but the one related to the image url has no class or span, so I've ended scraping them it via regex. The other 3 elements are easily retrievable.
Let's go by parts:
Importing modules
from bs4 import BeautifulSoup
import requests
import re
Function to get the images url from the public channel
def pictures(url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
link = str(soup.find_all('a', class_ = 'tgme_widget_message_photo_wrap')) #converted to str in order to be able to apply regex
image_url = re.findall(r"https://cdn4.*.*.jpg", link)
return image_url
Soup to get the text message, timestamp and views
picture_list = pictures(url)
url = "https://t.me/s/computer_science_and_programming"
channel = requests.get(url).text
soup = BeautifulSoup(channel, 'lxml')
tgpost = soup.find_all('div', class_ ='tgme_widget_message')
full_message = {}
for content in tgpost:
full_message['views'] = content.find('span', class_ = 'tgme_widget_message_views').text
full_message['timestamp'] = content.find('time', class_ = 'time').text
full_message['text'] = content.find('div', class_ = 'tgme_widget_message_text').text
print(full_message)
I would really appreciate if someone can help me, I'm new to Python and I don't know how I could do it to
Check if the post contains an image and if so, add it to the dictionary
Print the dictionary including image_url as key and the url as value for each post.
Thank you very much

I think you want something like this.
from bs4 import BeautifulSoup
import requests, re
url = "https://t.me/s/computer_science_and_programming"
channel = requests.get(url).text
soup = BeautifulSoup(channel, 'lxml')
tgpost = soup.find_all('div', class_ ='tgme_widget_message')
full_message = {}
for content in tgpost:
full_message['views'] = content.find('span', class_ = 'tgme_widget_message_views').text
full_message['timestamp'] = content.find('time', class_ = 'time').text
full_message['text'] = content.find('div', class_ = 'tgme_widget_message_text').text
if content.find('a', class_ = 'tgme_widget_message_photo_wrap') != None :
link = str(content.find('a', class_ = 'tgme_widget_message_photo_wrap'))
full_message['url_image'] = re.findall(r"https://cdn4.*.*.jpg", link)[0]
elif 'url_image' in full_message:
full_message.pop('url_image')
print(full_message)

Related

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- a code that (somewhat) succesfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
links = []
website = requests.get(url)
website_text = website.text
soup = BeautifulSoup(website_text)
for link in soup.find_all('a'):
links.append(link.get('href'))
for link in links:
print(link)
print(len(links))
get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
profiles = []
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
for profile in container.find_all('a'):
profiles.append(profile.get('p'))
for profile in profiles:
print(profile)
get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!
You need to combine your two scripts together and make requests for each player. Try the following approach. This searches for <td> tags that have the data-td=Player attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
data = []
req_url = requests.get(url)
soup = BeautifulSoup(req_url.content, "html.parser")
for td in soup.find_all('td', {'data-th' : 'Player'}):
a_tag = td.a
name = a_tag.text
player_url = a_tag['href']
print(f"Getting {name}")
req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
soup_player = BeautifulSoup(req_player_url.content, "html.parser")
div_profile_box = soup_player.find("div", class_="profile-box")
row = {"Name" : name, "URL" : player_url}
for p in div_profile_box.find_all("p"):
try:
key, value = p.get_text(strip=True).split(':', 1)
row[key.strip()] = value.strip()
except: # not all entries have values
pass
data.append(row)
return data
urls = [
'https://basketball.realgm.com/dleague/players/2022',
'https://basketball.realgm.com/dleague/players/2021',
'https://basketball.realgm.com/dleague/players/2020',
]
for url in urls:
print(f"Getting: {url}")
data = get_links(url)
for entry in data:
print(entry)

how to scrape data on a website with view more with beautifulsoup

i am trying to scrape news from reuters but there is a click to view more at the bottom on the website. I could not know how to load the hidden results by using beautiful soup.
from bs4 import BeautifulSoup
import urllib.request
def scrape_reuters_news(ticker):
url = "https://www.reuters.com/search/news?sortBy=relevance&dateRange=pastWeek&blob="+ticker
scraped_data = urllib.request.urlopen(url)
scraped_data = scraped_data.read()
parsed_articles = BeautifulSoup(scraped_data, 'lxml')
links = parsed_articles.find_all("h3")
articles = []
titles = []
title_class = "Text__text___3eVx1j Text__dark-grey___AS2I_p Text__medium___1ocDap Text__heading_2___sUlNJP Heading__base___1dDlXY Heading__heading_2___3f_bIW ArticleHeader__heading___3ibi0Q"
for link in links:
paragraphs = ""
url = "https://www.reuters.com/"+str(link)[41:63]
scraped_data = urllib.request.urlopen(url)
scraped_data = scraped_data.read()
parsed_article = BeautifulSoup(scraped_data, 'lxml')
article = parsed_article.find_all("p")
title = parsed_article.select("h1", {"class": title_class})
titles.append(title[0].text.strip())
for paragraph in article:
paragraphs += paragraph.text + " "
articles.append(paragraphs)
return titles, articles
# edit
ticker = "apple"
news = scrape_reuters_news(ticker)
When you click the load more a callback is issued that you can find in the network tab. If you grab the number of results from the search page, you can add this into the callback to get all results in one go. I then use regex to extract the id to reconstruct each detail page url and the title (headline)
You would then visit each link to get the paragraph info.
Please note:
There is some de-duplication work to do. There exist different ids which lead to same content. So perhaps exclude based on title?
You may need to consider whether any pre-processing of ticker needs to happen e.g. convert to lowercase, replace spaces with "-". I don't know all your use cases.
from bs4 import BeautifulSoup as bs
import requests, re
ticker = 'apple'
with requests.Session() as s:
r = s.get(f'https://www.reuters.com/search/news?sortBy=relevance&dateRange=pastWeek&blob={ticker}')
soup = bs(r.content, 'lxml')
num_results = soup.select_one('.search-result-count-num').text
r = s.get(f'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob={ticker}&bigOrSmall=big&articleWithBlog=true&sortBy=relevance&dateRange=pastWeek&numResultsToShow={num_results}&pn=&callback=addMoreNewsResults')
p = re.compile(r'id: "(.*?)"')
p2 = re.compile(r'headline: "(.*?)"')
links = [f'https://www.reuters.com/article/id{i}' for i in p.findall(r.text)]
headlines = [bs(i, 'lxml').get_text() for i in p2.findall(r.text)]
print(len(links), len(headlines))
From the detail pages you can get the paragraphs with
paras = ' '.join([i.get_text() for i in soup.select('[data-testid*=paragraph-]')])

python crawling beautifulsoup how to crawl several pages?

Please Help.
I want to get all the company names of each pages and they have 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the number.
So Here is my code so far.
Can I get just the title (company name) of 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests
maximum = 0
page = 1
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1'
response = requests.get(URL)
source = response.text
soup = BeautifulSoup(source, 'html.parser')
whole_source = ""
for page_number in range(1, maximum+1):
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/' + str(page_number)
response = requests.get(URL)
whole_source = whole_source + response.text
soup = BeautifulSoup(whole_source, 'html.parser')
find_company = soup.select("#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong")
for company in find_company:
print(company.text)
---------Output of one page
---------page source :)
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use the soup.findAll to find the list of company in the format like this:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract information from the <span> tag:
<span>중소기업진흥공단</span>
After that, you use .contents function to get the string from the <span> tag:
'중소기업진흥공단'
So you write a loop to do the same for each page, and make a list called company_list to store the results from each page and append them together.
Here's the code:
from bs4 import BeautifulSoup
import requests
maximum = 12
company_list = [] # List for result storing
for page_number in range(1, maximum+1):
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(page_number)
response = requests.get(URL)
print(page_number)
whole_source = response.text
soup = BeautifulSoup(whole_source, 'html.parser')
for entry in soup.findAll('strong', attrs={'class': 'company'}): # Finding all company names in the page
company_list.append(entry.find('span').contents[0]) # Extracting name from the result
The company_list will give you all the company names you want
I figured it out eventually. Thank you for your answer though!
image : code captured in jupyter notebook
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup
company_list=[]
for n in range(12):
url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n+1)
webpage = urlopen(url)
source = BeautifulSoup(webpage,'html.parser',from_encoding='utf-8')
companys = source.findAll('strong',{'class':'company'})
for company in companys:
company_list.append(company.get_text().strip().replace('\n','').replace('\t','').replace('\r',''))
file = open('company_name1.txt','w',encoding='utf-8')
for company in company_list:
file.write(company+'\n')
file.close()

none returned when trying to get tag value

In this html snippet from https://letterboxd.com/shesnicky/list/top-50-favourite-films/, I'm trying to go through all the different li tags and get the info from 'data-target-link' so I can then use that to create a new link that takes me to the page for that film, however every time I try and get the data it simply returns None or an error along those lines.
<li class="poster-container numbered-list-item" data-owner-rating="10"> <div class="poster film-poster really-lazy-load" data-image-width="125" data-image-height="187" data-film-slug="/film/donnie-darko/" data-linked="linked" data-menu="menu" data-target-link="/film/donnie-darko/" > <img src="https://s3.ltrbxd.com/static/img/empty-poster-125.c6227b2a.png" class="image" width="125" height="187" alt="Donnie Darko"/><span class="frame"><span class="frame-title"></span></span> </div> <p class="list-number">1</p> </li>
I'm going to be using the links to grab imgs for a twitter bot, so I tried doing this within my code:
class BotStreamer(tweepy.StreamListener):
print "Bot Streamer"
#on_data method of Tweepy’s StreamListener
#passes data from statuses to the on_status method
def on_status(self, status):
print "on status"
link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
page = requests.get(link)
soup = BS(page.content, 'html.parser')
movies_ul = soup.find('ul', {'class':'poster-list -p125 -grid film-list'})
movies = []
for mov in movies_ul.find('data-film-slug'):
movies.append(mov)
rand = randint(0,51)
newLink = "https://letterboxd.com%s" % (str(movies[rand]))
newPage = requests.get(newLink)
code = BS(newPage.content, 'html.parser')
code_div = code.find\
('div', {'class':'react-component film-poster film-poster-51910 poster'})
image = code_div.find('img')
url = image.get('src')
username = status.user.screen_name
status_id = status.id
tweet_reply(url, username, status_id)
However, I kept getting errors about list being out of range, or not being able to iterate over NoneType. So I made a test prgrm just to see if I could somehow get the data:
import requests
from bs4 import BeautifulSoup as BS
link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
page = requests.get(link)
soup = BS(page.content, 'html.parser')
movies_ul = soup.find('ul', {'class':'poster-list -p125 -grid film-list'})
more = movies_ul.find('li', {'class':'poster-container numbered-list-item'})
k = more.find('data-target-link')
print k
And again, all I get is None. Any help greatly appreciated.
Read doc: find() as first argument expects tag name, not attribute.
You may do
soup.find('div', {'data-target-link': True})
or
soup.find(attrs={'data-target-link': True})
Full example
import requests
from bs4 import BeautifulSoup as BS
link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
page = requests.get(link)
soup = BS(page.content, 'html.parser')
all_items = soup.find_all('div', {'data-target-link': True})
for item in all_items:
print(item['data-target-link'])

Web crawler - following links

Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am so stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup
# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
my_list = []
all_links = soup.find_all("a")
for link in all_links:
link2 = link["href"]
my_list.append(link2)
for i in my_list[1:93]:
print i
# The output shows all the links that I would like to follow and gather information from. How do I do that?
Here is my solution using lxml. It's similar to BeautifulSoup
import lxml
from lxml import html
import requests
page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[#class="LetterGroup"]//a/#href') # grab all link
print 'Length of all links = ', len(my_list)
my_list is a list consist of all links. And now you can use for loop to scrape information inside each page.
We can for loop through each links. Inside each page, you can extract information as example. This is only for the top table.
table_information = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
table_key = tree.xpath('//td[#class="statusHeader"]/text()')
table_value = tree.xpath('//td[#class="statusText"]/text()') + tree.xpath('//td[#class="statusText"]/a/text()')
table_information.append(zip([t]*len(table_key), table_key, table_value))
For table below the page,
table_information_below = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
l1 = tree.xpath('//tr[#class="tableRowPrimary"]/td[#class="StemmerNu"]/text()')
l2 = tree.xpath('//tr[#class="tableRowSecondary"]/td[#class="StemmerNu"]/text()')
table_information_below.append([t]+l1+l2)
Hope this help!
A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
soup = BeautifulSoup(urllib2.urlopen(url).read())
# then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession
def my_parse_function(html):
"""Use this function to parse each page"""
soup = BeautifulSoup(html)
all_paragraphs = soup.find_all('p')
return all_paragraphs
session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
page_results = [my_parse_function(future.result()) for future in results]
This would be my solution for your problem
import requests
from bs4 import BeautifulSoup
def spider():
url = "http://www.kmdvalg.dk/main"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('div', {'class': 'LetterGroup'}):
anc = link.find('a')
href = anc.get('href')
print(anc.getText())
print(href)
# spider2(href) call a second function from here that is similar to this one(making url = to herf)
spider2(href)
print("\n")
def spider2(linktofollow):
url = linktofollow
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
anc = link.find('td')
print(anc.getText())
print("\n")
spider()
its not done... i only get a simple element from the table but you get the idea and how its supposed to work.
Here is my final code that works smooth. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs
f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
liste = []
alle_links = soup.find_all("a")
for link in alle_links:
link2 = link["href"]
liste.append(link2)
for url in liste[1:93]:
soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
tds = soup.findAll('td')
stemmernu = soup.findAll('td', class_='StemmerNu')
print >> f, tds[5].string,";",tds[12].string,";",tds[14].string,";",tds[16].string,";", stemmernu[0].string,";",stemmernu[1].string,";",stemmernu[2].string,";",stemmernu[3].string,";",stemmernu[6].string,";",stemmernu[8].string,";",'\r\n'
f.close()

Categories