Website scraping with Python 2.7 and BeautifulSoup 4

I'm stuck at one point while scraping the website "http://www.queensbronxba.com/directory/" with BeautifulSoup. I'm almost done; all that's left is the company name, which sits in a paragraph tag. The problem is that each div contains several paragraph tags, but I only need the first one, since that one holds the company name. So I need the first paragraph of every following div, not just the first div. This is the code I used to scrape:
import requests
from bs4 import BeautifulSoup

page = requests.get("http://www.queensbronxba.com/directory/")
soup = BeautifulSoup(page.content, 'html.parser')
company = soup.find(class_="boardMemberWrap")
contact = company.find_all(class_="boardMember")
info = contact[0]
print(info.prettify())
name_tags = company.select("h4")
names = [nt.get_text() for nt in name_tags]
names
company_tags = company.select("p")  # here I need help to get only the first paragraph of each following div container
companies = [ct.get_text() for ct in company_tags]
companies
phone_tags = company.select('a[href^="tel"]')
phones = [pt.get_text() for pt in phone_tags]
phones
email_tags = company.select('a[href^="mailto"]')
emails = [et.get_text() for et in email_tags]
emails
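The version below solves it: loop over each boardMember div and slice find_all('p')[:1] so only the first paragraph, the company name, is taken from each one.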

import requests
from bs4 import BeautifulSoup

page = requests.get("http://www.queensbronxba.com/directory/")
soup = BeautifulSoup(page.content, 'html.parser')
company = soup.find(class_="boardMemberWrap")
contact = company.find_all(class_="boardMemberInfo")
info = contact[0]
print(info.prettify())
name_tags = company.select("h4")
names = [nt.get_text() for nt in name_tags]
print(names)
for name in company.find_all(class_="boardMember"):
    for n in name.find_all('p')[:1]:  # the slice keeps only the first <p>, i.e. the company name
        print(n.text)
phone_tags = company.select('a[href^="tel"]')
phones = [pt.get_text() for pt in phone_tags]
print(phones)
email_tags = company.select('a[href^="mailto"]')
emails = [et.get_text() for et in email_tags]
print(emails)

Related

Using Beautiful Soup to find a phone number for a company name and address

I have a script which scrapes a website for the name, region and province of companies in Spain. Within the HTML there is another link which takes you to a page containing the phone number, but when I try to scrape even that link, it prints "None". Is there a way for the script to automatically move to that page, scrape the number and match it with the company row?
import requests
from googlesearch import search
from bs4 import BeautifulSoup

for page in range(1, 65):
    url = "https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{page}.html".format(page=page)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    lists = soup.select("div#simulacion_tabla ul")
    # scrape the list
    for lis in lists:
        title = lis.find('li', class_="col1").text
        location = lis.find('li', class_="col2").text
        province = lis.find('li', class_="col3").text
        link = lis.find('href', class_="col1")
        info = [title, location, province, link]
        print(info)
Alternatively, is there a way to do it with the googlesearch library?
Many thanks
The first URL is "https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html", not "https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/1.html", which is why your script returns no output. You can try it like this:
import requests
# from googlesearch import search
from bs4 import BeautifulSoup

baseurl = ["https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html"]
urls = [f'https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{i}.html' for i in range(2, 5)]
allurls = baseurl + urls
print(allurls)
for url in allurls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    lists = soup.select("div#simulacion_tabla ul")
    # scrape the list
    for lis in lists:
        title = lis.find('li', class_="col1").text
        location = lis.find('li', class_="col2").text
        province = lis.find('li', class_="col3").text
        link = lis.select("li.col1 a")[0]['href']
        info = [title, location, province, link]
        print(info)
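To then pull the phone number itself, you could follow each extracted link with a second request. This is only a sketch: I haven't inspected the detail page, so the tel: selector below is an assumption you would need to adjust against the real markup.

import requests
from bs4 import BeautifulSoup

def scrape_phone(detail_url):
    # Fetch a company's detail page and try to pull a phone number.
    # Assumption: the number is rendered as an <a href="tel:..."> link;
    # inspect the page and swap in the real selector if it is not.
    page = requests.get(detail_url)
    soup = BeautifulSoup(page.content, "html.parser")
    tag = soup.select_one('a[href^="tel:"]')
    return tag.text.strip() if tag else None

# usage inside the loop above:
# info = [title, location, province, link, scrape_phone(link)]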

How to scrape data on a website with a "view more" button using BeautifulSoup

I am trying to scrape news from Reuters, but there is a "click to view more" button at the bottom of the website, and I can't figure out how to load the hidden results using Beautiful Soup.
from bs4 import BeautifulSoup
import urllib.request

def scrape_reuters_news(ticker):
    url = "https://www.reuters.com/search/news?sortBy=relevance&dateRange=pastWeek&blob=" + ticker
    scraped_data = urllib.request.urlopen(url)
    scraped_data = scraped_data.read()
    parsed_articles = BeautifulSoup(scraped_data, 'lxml')
    links = parsed_articles.find_all("h3")
    articles = []
    titles = []
    title_class = "Text__text___3eVx1j Text__dark-grey___AS2I_p Text__medium___1ocDap Text__heading_2___sUlNJP Heading__base___1dDlXY Heading__heading_2___3f_bIW ArticleHeader__heading___3ibi0Q"
    for link in links:
        paragraphs = ""
        url = "https://www.reuters.com/" + str(link)[41:63]
        scraped_data = urllib.request.urlopen(url)
        scraped_data = scraped_data.read()
        parsed_article = BeautifulSoup(scraped_data, 'lxml')
        article = parsed_article.find_all("p")
        title = parsed_article.select("h1", {"class": title_class})
        titles.append(title[0].text.strip())
        for paragraph in article:
            paragraphs += paragraph.text + " "
        articles.append(paragraphs)
    return titles, articles

# edit
ticker = "apple"
news = scrape_reuters_news(ticker)
When you click "load more", a callback is issued that you can find in the network tab. If you grab the number of results from the search page, you can add this into the callback to get all results in one go. I then use regex to extract the id, to reconstruct each detail page URL, and the title (headline).
You would then visit each link to get the paragraph info.
Please note:
There is some de-duplication work to do, since different ids can lead to the same content. So perhaps exclude based on title?
You may need to consider whether any pre-processing of ticker needs to happen, e.g. convert to lowercase, replace spaces with "-". I don't know all your use cases.
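For example, a minimal normalisation along those lines (a sketch; adjust to your actual tickers):

def normalise_ticker(raw):
    # lowercase and replace spaces with "-", per the note above
    return raw.lower().replace(' ', '-')

The main scrape then looks like this: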
from bs4 import BeautifulSoup as bs
import requests, re

ticker = 'apple'
with requests.Session() as s:
    r = s.get(f'https://www.reuters.com/search/news?sortBy=relevance&dateRange=pastWeek&blob={ticker}')
    soup = bs(r.content, 'lxml')
    num_results = soup.select_one('.search-result-count-num').text
    r = s.get(f'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob={ticker}&bigOrSmall=big&articleWithBlog=true&sortBy=relevance&dateRange=pastWeek&numResultsToShow={num_results}&pn=&callback=addMoreNewsResults')
    p = re.compile(r'id: "(.*?)"')
    p2 = re.compile(r'headline: "(.*?)"')
    links = [f'https://www.reuters.com/article/id{i}' for i in p.findall(r.text)]
    headlines = [bs(i, 'lxml').get_text() for i in p2.findall(r.text)]
    print(len(links), len(headlines))
From the detail pages you can get the paragraphs with
paras = ' '.join([i.get_text() for i in soup.select('[data-testid*=paragraph-]')])
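Putting those pieces together, one way to fetch every article's text is a small helper that reuses the session and the links list built above (a sketch using the same data-testid selector):

from bs4 import BeautifulSoup as bs
import requests

def get_paragraphs(url, session):
    # fetch one detail page and join its paragraph text
    r = session.get(url)
    soup = bs(r.content, 'lxml')
    return ' '.join(i.get_text() for i in soup.select('[data-testid*=paragraph-]'))

# usage, assuming `links` from the snippet above:
# with requests.Session() as s:
#     articles = [get_paragraphs(link, s) for link in links]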

Struggling to scrape a telegram public channel with BeautifulSoup

I'm practicing web scraping with BeautifulSoup, but I'm struggling to finish printing a dictionary of the items I've scraped.
The target can be any Telegram public channel (web version), and I intend to collect, as part of the dictionary, the text message, timestamp, views and image URL (if one is attached to the post).
I've inspected the code for the 4 elements, but the one related to the image URL has no class or span, so I've ended up scraping it via regex. The other 3 elements are easily retrievable.
Let's go by parts:
Importing modules
from bs4 import BeautifulSoup
import requests
import re
Function to get the image URLs from the public channel
def pictures(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    link = str(soup.find_all('a', class_='tgme_widget_message_photo_wrap'))  # converted to str in order to apply regex
    image_url = re.findall(r"https://cdn4.*.*.jpg", link)
    return image_url
Soup to get the text message, timestamp and views
url = "https://t.me/s/computer_science_and_programming"
picture_list = pictures(url)  # url must be defined before calling pictures()
channel = requests.get(url).text
soup = BeautifulSoup(channel, 'lxml')
tgpost = soup.find_all('div', class_='tgme_widget_message')
full_message = {}
for content in tgpost:
    full_message['views'] = content.find('span', class_='tgme_widget_message_views').text
    full_message['timestamp'] = content.find('time', class_='time').text
    full_message['text'] = content.find('div', class_='tgme_widget_message_text').text
    print(full_message)
I would really appreciate it if someone could help me. I'm new to Python and I don't know how to:
Check if the post contains an image and, if so, add it to the dictionary.
Print the dictionary including image_url as key and the URL as value for each post.
Thank you very much
I think you want something like this.
from bs4 import BeautifulSoup
import requests, re

url = "https://t.me/s/computer_science_and_programming"
channel = requests.get(url).text
soup = BeautifulSoup(channel, 'lxml')
tgpost = soup.find_all('div', class_='tgme_widget_message')
full_message = {}
for content in tgpost:
    full_message['views'] = content.find('span', class_='tgme_widget_message_views').text
    full_message['timestamp'] = content.find('time', class_='time').text
    full_message['text'] = content.find('div', class_='tgme_widget_message_text').text
    if content.find('a', class_='tgme_widget_message_photo_wrap') is not None:
        link = str(content.find('a', class_='tgme_widget_message_photo_wrap'))
        full_message['url_image'] = re.findall(r"https://cdn4.*.*.jpg", link)[0]
    elif 'url_image' in full_message:
        # the dict is reused across posts, so drop a stale image URL left over from the previous post
        full_message.pop('url_image')
    print(full_message)
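Since the single full_message dict is overwritten on each pass, every print shows only the current post. If you want to keep all posts, a small variation (a sketch reusing tgpost and re from above) builds a fresh dict per message, guards against posts that lack text or views, and collects everything in a list:

posts = []
for content in tgpost:
    msg = {}
    fields = {
        'views': ('span', 'tgme_widget_message_views'),
        'timestamp': ('time', 'time'),
        'text': ('div', 'tgme_widget_message_text'),
    }
    for key, (tag, cls) in fields.items():
        node = content.find(tag, class_=cls)
        if node is not None:  # some posts have no text or no view counter
            msg[key] = node.text
    photo = content.find('a', class_='tgme_widget_message_photo_wrap')
    if photo is not None:
        # non-greedy, escaped-dot variant of the regex used above
        match = re.findall(r"https://cdn4.*?\.jpg", str(photo))
        if match:
            msg['url_image'] = match[0]
    posts.append(msg)
print(posts)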

Python crawling with BeautifulSoup: how to crawl several pages?

Please help.
I want to get all the company names from each page, and there are 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the number.
So here is my code so far.
Can I get just the title (company name) from all 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests

maximum = 0
page = 1
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1'
response = requests.get(URL)
source = response.text
soup = BeautifulSoup(source, 'html.parser')
whole_source = ""
for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/' + str(page_number)
    response = requests.get(URL)
    whole_source = whole_source + response.text
soup = BeautifulSoup(whole_source, 'html.parser')
find_company = soup.select("#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong")
for company in find_company:
    print(company.text)
[screenshots: output of one page, and the page source]
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use soup.findAll to find the list of companies in a format like this:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract the <span> tag:
<span>중소기업진흥공단</span>
After that, you use the .contents attribute to get the string out of the <span> tag:
'중소기업진흥공단'
So you write a loop to do the same for each page, and make a list called company_list to store the results from each page, appending as you go.
Here's the code:
from bs4 import BeautifulSoup
import requests

maximum = 12
company_list = []  # list for storing results
for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(page_number)
    response = requests.get(URL)
    print(page_number)
    whole_source = response.text
    soup = BeautifulSoup(whole_source, 'html.parser')
    for entry in soup.findAll('strong', attrs={'class': 'company'}):  # find all company names on the page
        company_list.append(entry.find('span').contents[0])  # extract the name from the result
The company_list will then contain all the company names you want.
I figured it out eventually. Thank you for your answer though!
[screenshot: code captured in a Jupyter notebook]
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup

company_list = []
for n in range(12):
    url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n+1)
    webpage = urlopen(url)
    source = BeautifulSoup(webpage, 'html.parser', from_encoding='utf-8')
    companys = source.findAll('strong', {'class': 'company'})
    for company in companys:
        company_list.append(company.get_text().strip().replace('\n', '').replace('\t', '').replace('\r', ''))
file = open('company_name1.txt', 'w', encoding='utf-8')
for company in company_list:
    file.write(company + '\n')
file.close()

Unable to scrape name from google finance

I want to scrape the name, URL and description of companies as listed on Google Finance. So far I am successful in getting the description and URL, but unable to fetch the name. In the source code of myUrl, the name is "024 Pharma Inc". When I inspect the div, its class is named 'appbar-snippet-primary', but the code still doesn't find it. I am new to web scraping, so maybe I am missing something. Please guide me in this regard.
from bs4 import BeautifulSoup
import urllib
import csv
myUrl = 'https://www.google.com/finance?q=OTCMKTS%3AEEIG'
r = urllib.urlopen(myUrl).read()
soup = BeautifulSoup(r, 'html.parser')
name_box = soup.find('div', class_='appbar-snippet-primary') # !! This div is not found
#name = name_box.text
#print name
description = soup.find('div', class_='companySummary')
desc = description.text.strip()
#print desc
website = soup.find('div', class_='item')
site = website.text
#print site
from bs4 import BeautifulSoup
import requests
myUrl = 'https://www.google.com/finance?q=OTCMKTS%3AEEIG'
r = requests.get(myUrl).content
soup = BeautifulSoup(r, 'html.parser')
name = soup.find('title').text.split(':')[0]  # take the company name from the <title> tag instead
#print name
description = soup.find('div', class_='companySummary')
desc = description.text.strip()
#print desc
website = soup.find('div', class_='item')
site = website.text
Write soup.find_all() instead of soup.find().
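For completeness, that suggestion would look like the sketch below (continuing from the soup built in the answer above). Note that if the div is injected by JavaScript, find_all will also come back empty, which would explain why find returned None in the first place:

name_boxes = soup.find_all('div', class_='appbar-snippet-primary')
if name_boxes:
    print(name_boxes[0].text)
else:
    print('div not present in the static HTML')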
