so I am trying to create a function that creates a list of multiple soups. I started with doing it in normal code (I do not know how to call this exactly:
list_url = ["http://www.facebook.com", "https://www.google.com", "http://www.yahoo.com"]
list_soup = []
for url in list_url:
soup = BeautifulSoup(requests.get(url).text, "html.parser")
list_soup.append(soup)
And this code works, but when I a function of this:
def get_multi_soup(list_url):
list_multi = []
for url in list_url:
soup = BeautifulSoup(requests.get(url).text, "html.parser")
list_multi.append(soup)
return list_multi
list_soup = get_multi_soup(list_url)
The code does not work as intended as it only gives one soup instead of three.
Can somebody explain why this does not work? The list_soup equals only one soup.
just move the return outside the for loop.
You are returning on the first iteration.
def get_multi_soup(list_url):
list_multi = []
for url in list_url:
soup = BeautifulSoup(requests.get(url).text, "html.parser")
list_multi.append(soup)
return list_multi
list_soup = get_multi_soup(list_url)
Should do the trick. :)
Related
I'm making a web crawler for a recipe website and I would like to get the link for a recipe then use that link to get the ingredients. I am able to do that, but only by manually entering the link to get the recipe. Is there a way to get the link then use this link to look at the ingredients. Also I will take any suggestions on how to make this code better!
def trade_spider():
url= 'https://tasty.co/topic/best-vegetarian'
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'lxml')
for link in soup.find_all('a', {'class':'feed-item analyt-internal-link-subunit'}):
test = link.get('href')
print(test)
def ingredient_spider():
url1= 'https://tasty.co/recipe/peanut-butter-keto-cookies'
source_code1= requests.get(url1)
new_text= source_code1.text
soup1= BeautifulSoup(new_text, 'lxml')
for ingredients in soup1.find_all("li", {"class": "ingredient xs-mb1 xs-mt0"}):
print(ingredients.text)
To do this, ensure that the output of your is set to return rather than print (to understand the difference, try reading the top answer on this post: What is the formal difference between "print" and "return"?)
You can then use the output of the function as either a variable, or put the output directly into the next function.
For example
x = tradespider()
or
newFunction(tradespider())
You need to call the ingredient_spider function for every link you get from your recipe.
Using your example, it would look like this:
def trade_spider():
url= 'https://tasty.co/topic/best-vegetarian'
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'lxml')
for link in soup.find_all('a', {'class':'feed-item analyt-internal-link-subunit'}):
test = link.get('href')
ingredient_spider(test)
def ingredient_spider(url):
source_code1= requests.get(url) #receive url from trade_spider function
new_text= source_code1.text
soup1= BeautifulSoup(new_text, 'lxml')
for ingredients in soup1.find_all("li", {"class": "ingredient xs-mb1 xs-mt0"}):
print(ingredients.text)
For each link you get from test = link.get('href'), you call the function ingredient_spider(), sending test variable as argument.
I am honestly not sure if I understand correctly what you are asking, but if I do you could go with something like this:
first create a list of URLs
second create a function that can work a url
last create a worker that works off that list peace by peace
.
def first():
URLs = []
...
for link in soup.find_all('a', {'class':'feed-item analyt-internal-link-subunit'}):
URLs.append(link.get('href'))
return URLs
def second(url):
source_code1= requests.get(url)
new_text= source_code1.text
soup1= BeautifulSoup(new_text, 'lxml')
for ingredients in soup1.find_all("li", {"class": "ingredient xs-mb1 xs-mt0"}):
return ingredients.text
def third(URL_LIST):
for URL in URL_LIST:
tmp = second(URL)
print(tmp)
URL_LIST = first()
third(URL_LIST)
I want to know why lists all_links and all_titles don't want to receive any records from lists titles and links. I have tried also .extend() method and it didn't help.
import requests
from bs4 import BeautifulSoup
all_links = []
all_titles = []
def title_link(page_num):
page = requests.get(
'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/warszawa/page-%d/v%dc9073l3200008p%d'
% (page_num, page_num, page_num))
soup = BeautifulSoup(page.content, 'html.parser')
links = ['https://www.gumtree.pl' + link.get('href')
for link in soup.find_all('a', class_ ="href-link tile-title-text")]
titles = [flat.next_element for flat in soup.find_all('a', class_ = "href-link tile-title-text")]
print(titles)
for i in range(1,5+1):
title_link(i)
all_links = all_links + links
all_titles = all_titles + titles
i+=1
print(all_links)
import pandas as pd
df = pd.DataFrame(data = {'title': all_titles ,'link': all_links})
df.head(100)
#df.to_csv("./gumtree_page_1.csv", sep=';',index=False, encoding = 'utf-8')
#df.to_excel('./gumtree_page_1.xlsx')
When I ran your code, I got
NameError Traceback (most recent call last)
<ipython-input-3-6fff0b33d73b> in <module>
16 for i in range(1,5+1):
17 title_link(i)
---> 18 all_links = all_links + links
19 all_titles = all_titles + titles
20 i+=1
NameError: name 'links' is not defined
That points to a problem - variable named links is not defined in a global scope (where you add it to all_links). You can read about python scopes here. You'd need to return links and titles from title_link. Something similar to this:
def title_link(page_sum):
# your code here
return links, titles
for i in range(1,5+1):
links, titles = title_link(i)
all_links = all_links + links
all_titles = all_titles + titles
print(all_links)
This code is exhibits confusion about scoping. titles and links inside of title_link are local to that function. When the function ends, the data disappears and it cannot be accessed from another scope such as main. Use the return keyword to return values from functions. In this case, you'd need to return a tuple pair of titles and links like return titles, links.
Since functions should do one task only, having to return a pair shows reveals a possible design flaw. A function like title_link is overloaded and should probably be two separate functions, one to get titles and one to get links.
Having said that, the functions here seem like premature abstractions since the operations can be done directly.
Here's a suggested rewrite:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = "https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/warszawa/page-%d/v%dc9073l3200008p%d"
data = {"title": [], "link": []}
for i in range(1, 6):
page = requests.get(url % (i, i, i))
soup = BeautifulSoup(page.content, "html.parser")
titles = soup.find_all("a", class_="href-link tile-title-text")
data["title"].extend([x.next_element for x in titles])
data["link"].extend("https://www.gumtree.pl" + x.get("href") for x in titles)
df = pd.DataFrame(data)
print(df.head(100))
Other remarks:
i+=1 is unnecessary; for loops move forward automatically in Python.
(1,5+1) is clearer as (1, 6).
List comprehensions are great, but if they run multiple lines, consider writing them as normal loops or creating an intermediate variable or two.
Imports should be at the top of a file only. See PEP-8.
list.extend(other_list) is preferable to list = list + other_list, which is slow and memory-intensive, creating a whole copy of the list.
Try this:
import requests
from bs4 import BeautifulSoup
all_links = []
all_titles = []
def title_link(page_num):
page = requests.get(
'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/warszawa/page-%d/v%dc9073l3200008p%d'
% (page_num, page_num, page_num))
page.encoding = 'utf-8'
soup = BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8')
links = ['https://www.gumtree.pl' + link.get('href')
for link in soup.find_all('a', class_ ="href-link tile-title-text")]
titles = [flat.next_element for flat in soup.find_all('a', class_ = "href-link tile-title-text")]
print(titles)
return links, titles
for i in range(1,5+1):
links, titles = title_link(i)
all_links.extend(links)
all_titles.extend(titles)
# i+=1 not needed in python
print(all_links)
import pandas as pd
df = pd.DataFrame(data = {'title': all_titles ,'link': all_links})
df.head(100)
I think you just needed to get links and titles out of title_link(page_num).
Edit: removed the manual incrementing per comments
Edit: changed the all_links = all_links + links to all_links.extend(links)
Edit: website is utf-8 encoded, added page.encoding = 'utf-8' and as extra (probably unnecessary) measure, from_encoding='utf-8' to the BeautifulSoup
How do I get hrefs from hrefs using Python in class and method format?
I have tried:
root_url = 'https://www.iea.org'
class IEAData:
def __init__(self):
try:--
except:
def get_links(self, url):
all_links = []
page = requests.get(root_url)
soup = BeautifulSoup(page.text, 'html.parser')
for href in soup.find_all(class_='omrlist'):
all_links.append(root_url + href.find('a').get('href'))
return all_links
#print(all_links)
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
reportLinks = []
for url in yearLinks:
links =iea_obj.get_links(yearLinks)
print(links)
Recommended: links variable must have all month hrefs but not getting, so please tell me how I should do it.
There were a couple of issues with your code. Your get_links() function was not using the url that was passed to it. When looping over the returned links, you were passing yearLinks rather than the url.
The following should get you going:
from bs4 import BeautifulSoup
import requests
root_url = 'https://www.iea.org'
class IEAData:
def get_links(self, url):
all_links = []
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
for li in soup.find_all(class_='omrlist'):
all_links.append(root_url + li.find('a').get('href'))
return all_links
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
for url in yearLinks:
links = iea_obj.get_links(url)
print(url, links)
This would give you output starting:
https://www.iea.org/oilmarketreport/reports/2018/ ['https://www.iea.org/oilmarketreport/reports/2018/0118/', 'https://www.iea.org/oilmarketreport/reports/2018/0218/', 'https://www.iea.org/oilmarketreport/reports/2018/0318/', 'https://www.iea.org/oilmarketreport/reports/2018/0418/', 'https://www.iea.org/oilmarketreport/reports/2018/0518/', 'https://www.iea.org/oilmarketreport/reports/2018/0618/', 'https://www.iea.org/oilmarketreport/reports/2018/0718/', 'https://www.iea.org/oilmarketreport/reports/2018/0818/', 'https://www.iea.org/oilmarketreport/reports/2018/1018/']
https://www.iea.org/oilmarketreport/reports/2017/ ['https://www.iea.org/oilmarketreport/reports/2017/0117/', 'https://www.iea.org/oilmarketreport/reports/2017/0217/', 'https://www.iea.org/oilmarketreport/reports/2017/0317/', 'https://www.iea.org/oilmarketreport/reports/2017/0417/', 'https://www.iea.org/oilmarketreport/reports/2017/0517/', 'https://www.iea.org/oilmarketreport/reports/2017/0617/', 'https://www.iea.org/oilmarketreport/reports/2017/0717/', 'https://www.iea.org/oilmarketreport/reports/2017/0817/', 'https://www.iea.org/oilmarketreport/reports/2017/0917/', 'https://www.iea.org/oilmarketreport/reports/2017/1017/', 'https://www.iea.org/oilmarketreport/reports/2017/1117/', 'https://www.iea.org/oilmarketreport/reports/2017/1217/']
I'm fairly new to programming, and I'm still learning and trying to understand how classes and whatnot all work together. But gave it a shot (that's how we learn, right?)
Not sure if this is what you're looking for as your output. I changed 2 things and was able to put all the links from within the yearLinks into a list. Note that it'll also include the PDF links as well as the months links that I think you wanted. If you don't want those PDF links, and exclusively the months, then just don't include the pdf.
So here's the code I did it with, and maybe you can use that to fit into how you have it structured.
root_url = 'https://www.iea.org'
class IEAData:
def get_links(self, url):
all_links = []
page = requests.get(url)
soup = bs4.BeautifulSoup(page.text, 'html.parser')
for href in soup.find_all(class_='omrlist'):
all_links.append(root_url + href.find('a').get('href'))
return all_links
#print(all_links)
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
reportLinks = []
for url in yearLinks:
links = iea_obj.get_links(url)
# uncomment line below if you do not want the .pdf links
#links = [ x for x in links if ".pdf" not in x ]
reportLinks += links
I am trying to get all the unique urls of the website by calling the all_pages function recursively but this function is not giving all the urls of the website.
All I want to do is get all the unique urls of the website using BeautifulSoup. My code looks like this:
base_url = "http://www.readings.com.pk/"
unique_urls=[]
def all_pages(base_url,unique_urls=[]):
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a"):
url = link["href"]
absolute_url = urljoin(base_url, url)
if absolute_url not in unique_urls:
if base_url in absolute_url:
unique_urls.append(absolute_url)
print (absolute_url)
all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)
Use response.text instead of response.content
Also, you need to return at some point. Additionally, instead of making unique_urls a list, make it a set and they will always be unique.
Additionally, your method is recursive and python has a max recursion depth, so maybe you should instead do this:
base_url = "http://www.readings.com.pk/"
def all_pages(base_url):
response = requests.get(base_url)
unique_urls = {base_url}
visited_urls = set()
while len(unique_urls) > len(visited_urls)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
try:
url = link["href"]
except:
continue
absolute_url = base_url + url
unique_urls.add(absolute_url)
unvisited_url = (unique_urls - visited_urls).pop()
visited_urls.add(unvisited_url)
response = requests.get(unvisited_url)
return unique_urls
all_pages(base_url)
Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am so stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup
# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
my_list = []
all_links = soup.find_all("a")
for link in all_links:
link2 = link["href"]
my_list.append(link2)
for i in my_list[1:93]:
print i
# The output shows all the links that I would like to follow and gather information from. How do I do that?
Here is my solution using lxml. It's similar to BeautifulSoup
import lxml
from lxml import html
import requests
page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[#class="LetterGroup"]//a/#href') # grab all link
print 'Length of all links = ', len(my_list)
my_list is a list consist of all links. And now you can use for loop to scrape information inside each page.
We can for loop through each links. Inside each page, you can extract information as example. This is only for the top table.
table_information = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
table_key = tree.xpath('//td[#class="statusHeader"]/text()')
table_value = tree.xpath('//td[#class="statusText"]/text()') + tree.xpath('//td[#class="statusText"]/a/text()')
table_information.append(zip([t]*len(table_key), table_key, table_value))
For table below the page,
table_information_below = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
l1 = tree.xpath('//tr[#class="tableRowPrimary"]/td[#class="StemmerNu"]/text()')
l2 = tree.xpath('//tr[#class="tableRowSecondary"]/td[#class="StemmerNu"]/text()')
table_information_below.append([t]+l1+l2)
Hope this help!
A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
soup = BeautifulSoup(urllib2.urlopen(url).read())
# then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession
def my_parse_function(html):
"""Use this function to parse each page"""
soup = BeautifulSoup(html)
all_paragraphs = soup.find_all('p')
return all_paragraphs
session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
page_results = [my_parse_function(future.result()) for future in results]
This would be my solution for your problem
import requests
from bs4 import BeautifulSoup
def spider():
url = "http://www.kmdvalg.dk/main"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('div', {'class': 'LetterGroup'}):
anc = link.find('a')
href = anc.get('href')
print(anc.getText())
print(href)
# spider2(href) call a second function from here that is similar to this one(making url = to herf)
spider2(href)
print("\n")
def spider2(linktofollow):
url = linktofollow
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
anc = link.find('td')
print(anc.getText())
print("\n")
spider()
its not done... i only get a simple element from the table but you get the idea and how its supposed to work.
Here is my final code that works smooth. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs
f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
liste = []
alle_links = soup.find_all("a")
for link in alle_links:
link2 = link["href"]
liste.append(link2)
for url in liste[1:93]:
soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
tds = soup.findAll('td')
stemmernu = soup.findAll('td', class_='StemmerNu')
print >> f, tds[5].string,";",tds[12].string,";",tds[14].string,";",tds[16].string,";", stemmernu[0].string,";",stemmernu[1].string,";",stemmernu[2].string,";",stemmernu[3].string,";",stemmernu[6].string,";",stemmernu[8].string,";",'\r\n'
f.close()