Using beautiful soup to scrape data from indeed - python

I am trying to use BeautifulSoup to scrape resumes on Indeed, but I have run into some problems.
Here is the sample site: https://www.indeed.com/resumes?q=java&l=&cb=jt
Here is my code:
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')
def scrape_job_title(soup):
job = []
for div in soup.find_all(name='li', attrs={'class':'sre'}):
for a in div.find_all(name='a', attrs={'class':'app-link'}):
job.append(a['title'])
return(job)
scrape_job_title(soup)
It prints out nothing: []
As you can see in the picture, I want to grab the job title "Java developer".

The class is app_link, not app-link. Additionally, a['title'] doesn't do what you want. Use a.contents[0] instead.
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')
def scrape_job_title(soup):
job = []
for div in soup.find_all(name='li', attrs={'class':'sre'}):
for a in div.find_all(name='a', attrs={'class':'app_link'}):
job.append(a.contents[0])
return(job)
scrape_job_title(soup)

Try this to get all the job titles:
import requests
from bs4 import BeautifulSoup

URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html5lib')

for items in soup.select('.sre'):
    data = [item.text for item in items.select('.app_link')]
    print(data)
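If all you need is the title text itself, the two selects can also be collapsed into one descendant selector; a minimal variant, assuming the same .sre / .app_link markup and reusing the soup object from above:
# anchors with class app_link inside elements with class sre
titles = [a.get_text(strip=True) for a in soup.select('.sre .app_link')]
print(titles)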

Related

How do I make this web crawler print only the titles of the songs?

import requests
from bs4 import BeautifulSoup

url = 'https://www.officialcharts.com/charts/singles-chart'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')

urls = []
for link in soup.find_all('a'):
    print(link.get('href'))

def chart_spider(max_pages):
    page = 1
    while page >= max_pages:
        url = "https://www.officialcharts.com/charts/singles-chart"
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {"class": "title"}):
            href = "BAD HABITS" + link.title(href)
            print(href)
        page += 1

chart_spider(1)
Wondering how to make this print just the titles of the songs instead of the entire page. I want it to go through the top 100 charts and print all the titles for now. Thanks
Here is a possible solution, which modifies your code as little as possible:
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup

URL = 'https://www.officialcharts.com/charts/singles-chart'

def chart_spider():
    source_code = requests.get(URL)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for title in soup.find_all('div', {"class": "title"}):
        print(title.contents[1].string)

chart_spider()
The result is a list of all the titles found in the page, one per line.
If all you want is the titles for each song on the top 100,
this code:
import requests
from bs4 import BeautifulSoup
url='https://www.officialcharts.com/charts/singles-chart/'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
titles = [i.text.replace('\n', '') for i in soup.find_all('div', class_="title")]
does what you are looking for.
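To actually see them, you can print the resulting list afterwards, one title per line:
# titles is the list built by the snippet above
print('\n'.join(titles))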
You can do it like this.
The song title is inside a <div> tag whose class name is title.
Select all those <div> with .find_all(). This gives you a list of all <div> tags.
Iterate over the list and print the text of each div.
from bs4 import BeautifulSoup
import requests

url = 'https://www.officialcharts.com/charts/singles-chart/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

d = soup.find_all('div', class_='title')
for i in d:
    print(i.text.strip())
Sample Output:
BAD HABITS
STAY
REMEMBER
BLACK MAGIC
VISITING HOURS
HAPPIER THAN EVER
INDUSTRY BABY
WASTED
.
.
.

Scraping websites with BS4

I have this code
import requests
from bs4 import BeautifulSoup

result = requests.get("http://www.cvbankas.lt/")
src = result.content
soup = BeautifulSoup(src, 'lxml')

urls = []
for article_tag in soup.find_all("article"):
    a_tag = article_tag.find('a')
    urls.append(a_tag.attrs['href'])
    div_tag = article_tag.find('span')
    urls.append(div_tag.attrs['class'])
print(urls)
Can anyone explain to me how to get the data marked in red?
You can get the span with the class "salary_amount":
salary_object = article_tag.find("span", class_= "salary_amount")
and then extract the text with the .text attribute of the created object.
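Putting that together with the loop from the question, a minimal sketch (the salary_amount class name comes from this answer; the rest of the page structure is assumed, and some listings may not show a salary):
import requests
from bs4 import BeautifulSoup

result = requests.get("http://www.cvbankas.lt/")
soup = BeautifulSoup(result.content, 'lxml')

for article_tag in soup.find_all("article"):
    salary_object = article_tag.find("span", class_="salary_amount")
    if salary_object is not None:  # skip ads without a visible salary
        print(salary_object.text.strip())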

Following links with a second request - Web crawler

Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through results from a travel website. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the links and gather the pieces of information from each of those pages. But I am stuck. Hope you can give me a hint.
Here is my code:
import requests
from bs4 import BeautifulSoup
import urllib, collections

Spider = 1

def trade_spider(max_pages):
    RegionIDArray = {737: "London"}
    for reg in RegionIDArray:
        page = -1
        r = requests.get("https://www.viatorcom.de/London/d" + str(reg) + "&page=" + str(page), verify=False)
        soup = BeautifulSoup(r.content, "lxml")
        g_data = soup.find_all("h2", {"class": "mtm mbn card-title"})
        for item in g_data:
            Deeplink = item.find_all("a")
            for t in set(t.get("href") for t in Deeplink):
                Deeplink_final = t
                print(Deeplink_final)  # The output shows all the links that I would like to follow and gather information from.

trade_spider(1)
Output:
/de/7132/London-attractions/Stonehenge/d737-a113
/de/7132/London-attractions/Tower-of-London/d737-a93
/de/7132/London-attractions/London-Eye/d737-a1400
/de/7132/London-attractions/Thames-River/d737-a1410
The output shows all the links that I would like to follow and gather information from.
Next step in my code:
import requests
from bs4 import BeautifulSoup
import urllib, collections

Spider = 1

def trade_spider(max_pages):
    RegionIDArray = {737: "London"}
    for reg in RegionIDArray:
        page = -1
        r = requests.get("https://www.viatorcom.de/London/d" + str(reg) + "&page=" + str(page), verify=False)
        soup = BeautifulSoup(r.content, "lxml")
        g_data = soup.find_all("h2", {"class": "mtm mbn card-title"})
        for item in g_data:
            Deeplink = item.find_all("a")
            for t in set(t.get("href") for t in Deeplink):
                Deeplink_final = t

trade_spider(1)

def trade_spider2(max_pages):
    r = requests.get("https://www.viatorcom.de" + Deeplink_final, verify=False)
    soup = BeautifulSoup(r.content, "lxml")
    print(soup)

trade_spider2(9)
I would like to feed the initially crawled output into my second request, but this doesn't work. Hope you can give me a hint.
This should help.
import requests
from bs4 import BeautifulSoup
import urllib, collections

Spider = 1

def trade_spider2(Deeplink_final):
    r = requests.get("https://www.viatorcom.de" + Deeplink_final, verify=False)
    soup = BeautifulSoup(r.content, "lxml")
    print(soup)

def trade_spider(max_pages):
    RegionIDArray = {737: "London"}
    for reg in RegionIDArray:
        page = -1
        r = requests.get("https://www.viatorcom.de/London/d" + str(reg) + "&page=" + str(page), verify=False)
        soup = BeautifulSoup(r.content, "lxml")
        g_data = soup.find_all("h2", {"class": "mtm mbn card-title"})
        for item in g_data:
            Deeplink = item.find_all("a")
            for Deeplink_final in set(t.get("href") for t in Deeplink):
                trade_spider2(Deeplink_final)

trade_spider(1)
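The key change is that trade_spider2 now takes the deep link as a parameter instead of relying on a variable that only exists inside trade_spider, and the inner loop calls it once per href, so every collected link actually gets fetched.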

how to scrape multiple pages from one site

I want to scrape multiple pages from one site. The pattern is like this:
https://www.example.com/S1-3-1.html https://www.example.com/S1-3-2.html https://www.example.com/S1-3-3.html https://www.example.com/S1-3-4.html https://www.example.com/S1-3-5.html.
I tried three methods to scrape all of these pages at once, but every method only scrapes the first page. I show the code below; if anyone can check it and tell me what the problem is, it would be highly appreciated.
===============method 1====================
import requests

for i in range(5): # Number of pages plus one
    url = "https://www.example.com/S1-3-{}.html".format(i)
    r = requests.get(url)

from bs4 import BeautifulSoup
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
===============method 2=============
import urllib2, sys
from bs4 import BeautifulSoup

for numb in ('1', '5'):
    address = ('https://www.example.com/S1-3-' + numb + '.html')

html = urllib2.urlopen(address).read()
soup = BeautifulSoup(html, 'html.parser')
results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
=============method 3==============
import requests
from bs4 import BeautifulSoup

url = 'https://www.example.com/S1-3-1.html'
for round in range(5):
    res = requests.get(url)

soup = BeautifulSoup(res.text, 'html.parser')
results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
paging = soup.select('div.paging a')
next_url = 'https://www.example.com/' + paging[-1]['href']  # paging[-1]['href'] is the next page button on the page
url = next_url
I checked some answers, but it is not a loop problem; as the attached result screenshots show, only first-page results come back. This has been annoying me for several days.
Your indentation is out of order.
Try this (Method 1):
from bs4 import BeautifulSoup
import requests

for i in range(1, 6): # Number of pages plus one
    url = "https://www.example.com/S1-3-{}.html".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
Your page analysis should be inside the loop, like this, otherwise, it will only use one page:
.......
for i in range(5): # Number of pages plus one
    url = "https://www.example.com/S1-3-{}.html".format(i)
    r = requests.get(url)
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
........
First, you have to put all of the statements inside the loop; otherwise it will only work with the last iteration.
Second, you could try closing the response at the end of each iteration:
import requests
from bs4 import BeautifulSoup

for i in range(5): # Number of pages plus one
    url = "https://www.example.com/S1-3-{}.html".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
    r.close()
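If you actually want to reuse one HTTP session across all the pages rather than opening a fresh connection each time, here is a minimal sketch, assuming the same example.com URL pattern as above:
import requests
from bs4 import BeautifulSoup

# a single Session reuses the underlying connection for every page
with requests.Session() as session:
    for i in range(1, 6):
        url = "https://www.example.com/S1-3-{}.html".format(i)
        r = session.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        results = soup.find_all('div', attrs={'class': 'product-item item-template-0 alternative'})
        print(url, len(results))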

Web crawler - following links

Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am so stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup

# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())

my_list = []
all_links = soup.find_all("a")
for link in all_links:
    link2 = link["href"]
    my_list.append(link2)

for i in my_list[1:93]:
    print i

# The output shows all the links that I would like to follow and gather information from. How do I do that?
Here is my solution using lxml. It's similar to BeautifulSoup
import lxml
from lxml import html
import requests

page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[@class="LetterGroup"]//a/@href')  # grab all links
print 'Length of all links = ', len(my_list)
my_list is a list consisting of all the links. Now you can use a for loop to scrape the information inside each page.
We can loop through each link. Inside each page, you can extract information as in the example below. This is only for the top table.
table_information = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    table_key = tree.xpath('//td[@class="statusHeader"]/text()')
    table_value = tree.xpath('//td[@class="statusText"]/text()') + tree.xpath('//td[@class="statusText"]/a/text()')
    table_information.append(zip([t]*len(table_key), table_key, table_value))
For the table further down the page:
table_information_below = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    l1 = tree.xpath('//tr[@class="tableRowPrimary"]/td[@class="StemmerNu"]/text()')
    l2 = tree.xpath('//tr[@class="tableRowSecondary"]/td[@class="StemmerNu"]/text()')
    table_information_below.append([t] + l1 + l2)
Hope this helps!
A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    # then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession
from bs4 import BeautifulSoup

def my_parse_function(html):
    """Use this function to parse each page"""
    soup = BeautifulSoup(html, 'html.parser')
    all_paragraphs = soup.find_all('p')
    return all_paragraphs

session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
page_results = [my_parse_function(future.result().content) for future in futures]
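For context: session.get() returns a future immediately while the request runs on one of the five worker threads, and future.result() blocks only until that particular response has arrived, so the pages are fetched several at a time instead of strictly one after another.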
This would be my solution for your problem
import requests
from bs4 import BeautifulSoup

def spider():
    url = "http://www.kmdvalg.dk/main"
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for link in soup.findAll('div', {'class': 'LetterGroup'}):
        anc = link.find('a')
        href = anc.get('href')
        print(anc.getText())
        print(href)
        # spider2(href) calls a second function from here that is similar to this one (making url equal to the href)
        spider2(href)
        print("\n")

def spider2(linktofollow):
    url = linktofollow
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
        anc = link.find('td')
        print(anc.getText())
    print("\n")

spider()
It's not done... I only get a single element from the table, but you get the idea of how it's supposed to work.
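If you want more than that single cell, here is a small sketch of a spider2 variant that collects every <td> in each row (the tableRowPrimary class comes from the answer above; which of those cells hold the figures you need is left to you):
import requests
from bs4 import BeautifulSoup

def spider2_all_cells(linktofollow):
    source_code = requests.get(linktofollow)
    soup = BeautifulSoup(source_code.text, 'html.parser')
    for row in soup.findAll('tr', {'class': 'tableRowPrimary'}):
        # text of every cell in the row, not just the first <td>
        cells = [td.getText().strip() for td in row.findAll('td')]
        print(cells)

# call it with one of the hrefs collected by spider()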
Here is my final code that works smoothly. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs

f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")

soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())

liste = []
alle_links = soup.find_all("a")
for link in alle_links:
    link2 = link["href"]
    liste.append(link2)

for url in liste[1:93]:
    soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
    tds = soup.findAll('td')
    stemmernu = soup.findAll('td', class_='StemmerNu')
    print >> f, tds[5].string, ";", tds[12].string, ";", tds[14].string, ";", tds[16].string, ";", stemmernu[0].string, ";", stemmernu[1].string, ";", stemmernu[2].string, ";", stemmernu[3].string, ";", stemmernu[6].string, ";", stemmernu[8].string, ";", '\r\n'

f.close()
