Web crawler - following links

Web crawler - following links - python

Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am so stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup
# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
my_list = []
all_links = soup.find_all("a")
for link in all_links:
link2 = link["href"]
my_list.append(link2)
for i in my_list[1:93]:
print i
# The output shows all the links that I would like to follow and gather information from. How do I do that?

Here is my solution using lxml. It's similar to BeautifulSoup
import lxml
from lxml import html
import requests
page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[#class="LetterGroup"]//a/#href') # grab all link
print 'Length of all links = ', len(my_list)
my_list is a list consisting of all the links. Now you can use a for loop to scrape the information inside each page.
We can for loop through each links. Inside each page, you can extract information as example. This is only for the top table.
# One entry per followed page: (url, header, value) triples from the top table.
table_information = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    # '@class' is the XPath attribute test ('#class' is not valid XPath).
    table_key = tree.xpath('//td[@class="statusHeader"]/text()')
    # Values come from the cell's own text plus the text of links inside it.
    table_value = (tree.xpath('//td[@class="statusText"]/text()')
                   + tree.xpath('//td[@class="statusText"]/a/text()'))
    table_information.append(zip([t]*len(table_key), table_key, table_value))
For table below the page,
# One entry per followed page: the url followed by every "StemmerNu" cell text
# from the primary rows, then from the secondary rows.
table_information_below = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    # '@class' is the XPath attribute test ('#class' is not valid XPath).
    l1 = tree.xpath('//tr[@class="tableRowPrimary"]/td[@class="StemmerNu"]/text()')
    l2 = tree.xpath('//tr[@class="tableRowSecondary"]/td[@class="StemmerNu"]/text()')
    table_information_below.append([t]+l1+l2)
Hope this helps!

A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
    # Download one page, then hand its HTML to the parser.
    page_html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page_html)
    # then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession
def my_parse_function(html):
    """Parse one page's HTML and return all of its <p> elements."""
    return BeautifulSoup(html).find_all('p')
# Issue all requests up front on a pool of five workers.
session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
# future.result() blocks until that response has arrived and returns the
# response object; the parser needs its body text, not the object itself.
page_results = [my_parse_function(future.result().text) for future in futures]

This would be my solution for your problem
import requests
from bs4 import BeautifulSoup
def spider():
    """Print the first link of every LetterGroup block on the index page and crawl it."""
    index_url = "http://www.kmdvalg.dk/main"
    response = requests.get(index_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for group in soup.find_all('div', {'class': 'LetterGroup'}):
        anchor = group.find('a')  # only the first anchor inside the block
        target = anchor.get('href')
        print(anchor.getText())
        print(target)
        # Follow the link and print details from the page it points to.
        spider2(target)
        print("\n")
def spider2(linktofollow):
    """Print the first cell of every primary table row on the page at *linktofollow*."""
    response = requests.get(linktofollow)
    soup = BeautifulSoup(response.text, 'html.parser')
    for row in soup.find_all('tr', {'class': 'tableRowPrimary'}):
        first_cell = row.find('td')
        print(first_cell.getText())
    print("\n")
spider()
It's not done... I only get a simple element from the table, but you get the idea of how it's supposed to work.

Here is my final code, which works smoothly. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs
# The context manager closes the output file even when a download or a
# table lookup fails part-way through the crawl.
with codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1") as f:
    # Collect every href on the landing page, in document order.
    soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
    alle_links = soup.find_all("a")
    liste = [link["href"] for link in alle_links]
    for url in liste[1:93]:
        # Pages are decoded as iso-8859-1 before parsing.
        soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
        tds = soup.findAll('td')
        stemmernu = soup.findAll('td', class_='StemmerNu')
        # One ';'-separated record per page, built from fixed cell positions.
        print >> f, tds[5].string,";",tds[12].string,";",tds[14].string,";",tds[16].string,";", stemmernu[0].string,";",stemmernu[1].string,";",stemmernu[2].string,";",stemmernu[3].string,";",stemmernu[6].string,";",stemmernu[8].string,";",'\r\n'

Related

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- code that (somewhat) successfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
# attrs must map attribute name -> value. The previous {'class', 'main-container'}
# was a set literal, which is not an attribute mapping.
container = soup.find('div', attrs={'class': 'main-container'})
# Every <p> element nested anywhere inside the container.
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
    """Print the href of every anchor on the page at *url*, then the number of anchors."""
    page_text = requests.get(url).text
    soup = BeautifulSoup(page_text)
    links = [anchor.get('href') for anchor in soup.find_all('a')]
    for href in links:
        print(href)
    print(len(links))
get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
    """Print the value of the "p" attribute of every anchor in the page's main container."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', attrs={'class', 'main-container'})
    # NOTE: Tag.get() returns the value of the HTML *attribute* with that name
    # (None when it is absent); it does not search for nested <p> tags.
    profiles = [anchor.get('p') for anchor in container.find_all('a')]
    for value in profiles:
        print(value)
get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!

You need to combine your two scripts together and make requests for each player. Try the following approach. This searches for <td> tags that have the data-th="Player" attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
    """Collect profile data for every player linked from the page at *url*.

    Args:
        url: Address of a page whose player cells are <td data-th="Player">
            elements, each wrapping a link to the player's page.

    Returns:
        A list of dicts, one per player, holding "Name", "URL" and one entry
        for each "key: value" paragraph found in the player's profile box.
    """
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")
    for td in soup.find_all('td', {'data-th' : 'Player'}):
        a_tag = td.a
        if a_tag is None:  # cell without a link: nothing to follow
            continue
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")
        req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name" : name, "URL" : player_url}
        # A page without a profile box still yields a row with name and URL.
        paragraphs = div_profile_box.find_all("p") if div_profile_box is not None else []
        for p in paragraphs:
            key, sep, value = p.get_text(strip=True).partition(':')
            if not sep:  # not all entries have values
                continue
            row[key.strip()] = value.strip()
        data.append(row)
    return data
urls = [
'https://basketball.realgm.com/dleague/players/2022',
'https://basketball.realgm.com/dleague/players/2021',
'https://basketball.realgm.com/dleague/players/2020',
]
for url in urls:
print(f"Getting: {url}")
data = get_links(url)
for entry in data:
print(entry)

Iterating over urls fails to find correct href in Python using BeautifulSoup

I am iterating through the website in the code. The following is what my code does. Loops through the 52 pages and gets the link to each URLs.
Then it iterates through those URLs and tries to get the link for the English Translation. if you see the Mongolian website, it has a section "Орчуулга" on the top right and it has "English" underneath - that is the link to the English translation.
However, my code fails to grab the link for the english translation and gives a wrong url.
Below is a sample output for the first article.
1
{'https://mn.usembassy.gov/mn/2020-naadam-mn/': 'https://mn.usembassy.gov/mn/sitemap-mn/'}
The expected output for the first page should be
1
{'https://mn.usembassy.gov/mn/2020-naadam-mn/': 'https://mn.usembassy.gov/2020-naadam/'}
Below is my code
import requests
from bs4 import BeautifulSoup
url = 'https://mn.usembassy.gov/mn/news-events-mn/page/{page}/'
urls = []
# Walk the 52 listing pages and keep the first link inside each <h2>.
for page in range(1, 53):
    print(str(page) + "/52")
    listing = requests.get(url.format(page=page))
    soup = BeautifulSoup(listing.content, 'html.parser')
    urls.extend(h.find('a').attrs['href'] for h in soup.find_all('h2'))
print(urls)
bilingual_dict = {}
for i, url in enumerate(urls, 1):
    print(i)
    article = requests.get(url.format(page=url))
    soup = BeautifulSoup(article.content, 'html.parser')
    for div in soup.find_all('div', class_='translations_sidebar'):
        # NOTE: this searches the whole document (soup), not just `div`, so
        # the last <li> link found anywhere on the page ends up stored.
        for ul in soup.find_all('ul'):
            for li in ul.find_all('li'):
                a = li.find('a')
                bilingual_dict[url] = a['href']
    print(bilingual_dict)
print(bilingual_dict)

This script will print link to english translation:
import requests
from bs4 import BeautifulSoup
url = 'https://mn.usembassy.gov/mn/2020-naadam-mn/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
# First anchor whose hreflang attribute is "en".
link = soup.select_one('a[hreflang="en"]')
print(link['href'])
Prints:
https://mn.usembassy.gov/2020-naadam/
Complete code: (Where there isn't link to english translation, the value is set to None)
import requests
from bs4 import BeautifulSoup
from pprint import pprint
url = 'https://mn.usembassy.gov/mn/news-events-mn/page/{page}/'
urls = []
# Walk the 52 listing pages and keep the first link inside each <h2>.
for page in range(1, 53):
    print('Page {}...'.format(page))
    listing = requests.get(url.format(page=page))
    soup = BeautifulSoup(listing.content, 'html.parser')
    urls.extend(h.find('a').attrs['href'] for h in soup.find_all('h2'))
pprint(urls)
bilingual_dict = {}
# Map each article to the href of its hreflang="en" link, or None if absent.
for url in urls:
    print(url)
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    link = soup.select_one('a[hreflang="en"]')
    if link:
        bilingual_dict[url] = link['href']
    else:
        bilingual_dict[url] = None
pprint(bilingual_dict)

How can I get the correct urls of ads?

I'm trying to scrape the urls of the ads on "Marktplaats" website (link is provided below).
As you can see, I'm looking for 30 URLs. These URLs are placed inside an 'href' field and all start with "/a/auto-s/". Unfortunately, I only keep getting the first few URLs. I found out that on this site all the data is placed within "<li class = "mp-Listing mp-Listing--list-item"> ... </li>". Does anyone have an idea how to fix it? (You can see that you won't find all the URLs of the ads when you run my code.)
Link:
https://www.marktplaats.nl/l/auto-s/#f:10882,10898|PriceCentsTo:350000|constructionYearFrom:2001|offeredSince:TODAY|searchInTitleAndDescription:true
My code:
import requests
from bs4 import BeautifulSoup
url = "https://www.marktplaats.nl/l/auto-s/#f:10882,10898|PriceCentsTo:350000|constructionYearFrom:2001|offeredSince:TODAY|searchInTitleAndDescription:true"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
url_list = soup.find_all(class_ = 'mp-Listing mp-Listing--list-item')
print(url_list)

You can try something like this:
import requests
from bs4 import BeautifulSoup
def parse_links(url):
    """Return the href of the first anchor in every listing item on the page at *url*."""
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    listing_items = soup.find_all(class_="mp-Listing mp-Listing--list-item")
    return [item.a.get('href') for item in listing_items]
url = "https://www.marktplaats.nl/l/auto-s/#f:10882,10898|PriceCentsTo:350000|constructionYearFrom:2001|offeredSince:TODAY|searchInTitleAndDescription:true"
links = parse_links(url)
print('\n'.join(map(str, links)))
Output
/a/auto-s/oldtimers/a1302359148-allis-chalmers-ed40-1965.html
/a/auto-s/bestelauto-s/a1258166221-opel-movano-2-3-cdti-96kw-2018.html
/a/auto-s/oldtimers/a1302359184-chevrolet-biscayne-bel-air-1960.html
/a/auto-s/renault/a1240974413-ruim-aanbod-rolstoelauto-s-www-autoland-nl.html
/a/auto-s/volkswagen/m1457703674-golf-6-1-2tsi-comfortline-bluemotion-77kw-2de-eigenaar.html
/a/auto-s/peugeot/m1457564187-peugeot-208-1-6-e-hdi-68kw-92pk-5-d-2014-zwart.html
/a/auto-s/volkswagen/m1457124365-volkswagen-touareg-3-2-v6-177kw-4motion-aut-2004-grijs.html
/a/auto-s/volkswagen/m1456753596-volkswagen-golf-vii-2-0-tdi-highline-150pk-xenon-trekhaak.html
/a/auto-s/bestelauto-s/a1001658686-200-nw-en-gebruikte-bestelwagens-personenbusjes-pick-ups.html
/a/auto-s/bestelauto-s/m940111355-bus-verkopen-bestelauto-inkoop-bestelwagen-opkoper-rdw.html
/a/auto-s/volkswagen/m1456401063-volkswagen-golf-1-6-74kw-2000-zwart.html
/a/auto-s/renault/m1456242548-renault-espace-2-0-dci-110kw-e4-2006-zwart.html
/a/auto-s/nissan/m1448699345-nissan-qashqai-1-5-dci-connect-2011-grijs-panoramadak.html
/a/auto-s/bestelauto-s/a1212708374-70-x-kleine-bestelwagens-lage-km-scherpe-prijzen.html
/a/auto-s/bmw/m1452641019-bmw-5-serie-2-0-520d-touring-aut-2014-grijs.html
/a/auto-s/mercedes-benz/m1448671698-mercedes-benz-a-klasse-a250-amg-224pk-7g-dct-panoramadak-wid.html
/a/auto-s/bmw/m1455671862-bmw-3-serie-2-0-i-320-cabrio-aut-2007-bruin.html
/a/auto-s/bestelauto-s/m1455562699-volkswagen-transporter-kmstand-151-534-2-5-tdi-65kw-2002.html
/a/auto-s/bestelauto-s/a1295698562-35-x-renault-kangoo-2013-t-m-2015-v-a-25000-km.html
/a/auto-s/infiniti/m1458111256-infiniti-q50-3-5-hybrid-awd-2016-grijs.html
/a/auto-s/ford/m1458111166-ford-ka-1-3-i-44kw-2007-zwart.html
/a/auto-s/bestelauto-s/m1457499260-renault-master-l3h2-2018-airco-camera-cruise-laadruimte-12.html
/a/auto-s/land-rover/m1458110209-land-rover-discovery-4-3-0-tdv6-2010-grijs.html
/a/auto-s/dodge/a1279463634-5-jaar-ram-dealer-garantie-lage-bijtelling.html
/a/auto-s/bmw/m1455389317-bmw-320i-e46-sedan-bieden.html
/a/auto-s/ford/m1457306473-ford-galaxy-2-0-tdci-85kw-dpf-2011-blauw.html
/a/auto-s/peugeot/m1456912876-peugeot-407-2-0-16v-sw-2006-grijs.html
/a/auto-s/bestelauto-s/m1457161395-renault-master-t35-2-3-dci-l3h2-130-pk-navi-airco-camera-pdc.html
/a/auto-s/bestelauto-s/a1299134880-citroen-berlingo-1-6-hdi-2017-airco-sd-3-zits-v-a-179-p-m.html
/a/auto-s/hyundai/m1458105451-hyundai-atos-gezocht-hoge-prijs-tel-0653222206.html
/a/auto-s/volkswagen/m1458103618-volkswagen-polo-1-4-tsi-132kw-dsg-2012-wit.html
/a/auto-s/vrachtwagens/m1458101965-scania-torpedo.html
/a/auto-s/toyota/m1458101624-toyota-yaris-1-0-12v-vvt-i-aspiration-5dr-2012.html
/a/auto-s/dodge/a1279447576-5-jaar-ram-dealer-garantie-en-historie-bekijk-onze-website.html
You can also build the actual url of the page by prepending 'https://www.marktplaats.nl' to li.a.get('href'). So, your whole code should look like this:
import requests
from bs4 import BeautifulSoup
def parse_links(url):
    """Return the absolute URL of the first link in every listing item at *url*.

    Args:
        url: Address of the results page to download.

    Returns:
        A list of absolute URLs, one per listing item that contains an
        anchor with an href; items without one are skipped.
    """
    # Imported here so the function is self-contained.
    from urllib.parse import urljoin

    links = []
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for li in soup.find_all(class_="mp-Listing mp-Listing--list-item"):
        anchor = li.a
        if anchor is None or not anchor.get('href'):
            continue
        # urljoin resolves site-relative hrefs against the site root and
        # leaves hrefs that are already absolute unchanged.
        links.append(urljoin('https://www.marktplaats.nl', anchor['href']))
    return links
url = "https://www.marktplaats.nl/l/auto-s/#f:10882,10898|PriceCentsTo:350000|constructionYearFrom:2001|offeredSince:TODAY|searchInTitleAndDescription:true"
links = parse_links(url)
print('\n'.join(map(str, links)))
It should produce the output like this:
https://www.marktplaats.nl/a/auto-s/renault/a1302508082-mooi-renault-megane-scenic-1-6-16v-aut-2005-2003-groen-airco.html
https://www.marktplaats.nl/a/auto-s/oldtimers/a1302359157-morris-minor-cabriolet-1970.html
https://www.marktplaats.nl/a/auto-s/oldtimers/a1302743902-online-veiling-oldtimers-en-classic-cars-zedelgem-vavato.html
https://www.marktplaats.nl/a/auto-s/oldtimers/a1302359138-mercedes-benz-g-500-guard-pantzer-1999.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1457703674-golf-6-1-2tsi-comfortline-bluemotion-77kw-2de-eigenaar.html
https://www.marktplaats.nl/a/auto-s/peugeot/m1457564187-peugeot-208-1-6-e-hdi-68kw-92pk-5-d-2014-zwart.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1457124365-volkswagen-touareg-3-2-v6-177kw-4motion-aut-2004-grijs.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1456753596-volkswagen-golf-vii-2-0-tdi-highline-150pk-xenon-trekhaak.html
https://www.marktplaats.nl/a/auto-s/volkswagen/a1279696849-vw-take-up-5-d-radio-airco-private-lease.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/m940111355-bus-verkopen-bestelauto-inkoop-bestelwagen-opkoper-rdw.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1456401063-volkswagen-golf-1-6-74kw-2000-zwart.html
https://www.marktplaats.nl/a/auto-s/renault/m1456242548-renault-espace-2-0-dci-110kw-e4-2006-zwart.html
https://www.marktplaats.nl/a/auto-s/nissan/m1448699345-nissan-qashqai-1-5-dci-connect-2011-grijs-panoramadak.html
https://www.marktplaats.nl/a/auto-s/citroen/a1277007710-citroen-c1-feel-5-d-airco-private-lease-vanaf-189-euro-mnd.html
https://www.marktplaats.nl/a/auto-s/bmw/m1452641019-bmw-5-serie-2-0-520d-touring-aut-2014-grijs.html
https://www.marktplaats.nl/a/auto-s/mercedes-benz/m1448671698-mercedes-benz-a-klasse-a250-amg-224pk-7g-dct-panoramadak-wid.html
https://www.marktplaats.nl/a/auto-s/bmw/m1455671862-bmw-3-serie-2-0-i-320-cabrio-aut-2007-bruin.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/m1455562699-volkswagen-transporter-kmstand-151-534-2-5-tdi-65kw-2002.html
https://www.marktplaats.nl/a/auto-s/peugeot/a1298813052-private-lease-occasion-outlet-prive-lease.html
https://www.marktplaats.nl/a/auto-s/audi/m1458114563-audi-a4-2-0-tfsi-132kw-avant-multitronic-nl-auto.html
https://www.marktplaats.nl/a/auto-s/mercedes-benz/m1452983872-mercedes-a-klasse-2-0-cdi-a200-5drs-aut-2007-grijs.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/m1457499260-renault-master-l3h2-2018-airco-camera-cruise-laadruimte-12.html
https://www.marktplaats.nl/a/auto-s/infiniti/m1458111256-infiniti-q50-3-5-hybrid-awd-2016-grijs.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/a1001658686-200-nw-en-gebruikte-bestelwagens-personenbusjes-pick-ups.html
https://www.marktplaats.nl/a/auto-s/ford/m1458111166-ford-ka-1-3-i-44kw-2007-zwart.html
https://www.marktplaats.nl/a/auto-s/land-rover/m1458110209-land-rover-discovery-4-3-0-tdv6-2010-grijs.html
https://www.marktplaats.nl/a/auto-s/bmw/m1455389317-bmw-320i-e46-sedan-bieden.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/m1457161395-renault-master-t35-2-3-dci-l3h2-130-pk-navi-airco-camera-pdc.html
https://www.marktplaats.nl/a/auto-s/renault/a1302508082-mooi-renault-megane-scenic-1-6-16v-aut-2005-2003-groen-airco.html
https://www.marktplaats.nl/a/auto-s/ford/m1457306473-ford-galaxy-2-0-tdci-85kw-dpf-2011-blauw.html
https://www.marktplaats.nl/a/auto-s/peugeot/m1456912876-peugeot-407-2-0-16v-sw-2006-grijs.html
https://www.marktplaats.nl/a/auto-s/hyundai/m1458105451-hyundai-atos-gezocht-hoge-prijs-tel-0653222206.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1458103618-volkswagen-polo-1-4-tsi-132kw-dsg-2012-wit.html
https://www.marktplaats.nl/a/auto-s/oldtimers/a1302743902-online-veiling-oldtimers-en-classic-cars-zedelgem-vavato.html
Good luck!

Web Scraping through Python BeautifulSoup

I am just a beginner at Python.
I am trying to scrape data from a site and have managed to write the below code.
However, I am not sure how to proceed ahead as I am unable to get the href tags so that I can go to each listing & get the data. I am also not very well aware of HTML Tags, so I suspect that I have not identified the tags properly.
Here is my code :
import requests
from bs4 import BeautifulSoup
# Listing pages 1 through 4 of the directory.
page_template = "https://directory.singaporefintech.org/?p={0}&category=0&zoom=15&is_mile=0&directory_radius=0&view=list&hide_searchbox=0&hide_nav=0&hide_nav_views=0&hide_pager=0&featured_only=0&feature=1&perpage=20&sort=random"
urls = [page_template.format(page_no) for page_no in range(1, 5)]
Data = []
for info in urls:
    page = requests.get(info)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Anchors that themselves carry the class "sabai-directory-title".
    links = soup.find_all('a', attrs={'class': 'sabai-directory-title'})
    # Rebuilt on every pass, so only the last page's hrefs remain afterwards.
    hrefs = [link['href'] for link in links]
The above code is producing hrefs as a blank list.
Any help would be highly appreciated!!
Thanks!!!

Code is fine, the class that you're looking for just doesn't exist on those pages. For example, substituted sabai-directory-title class with comment-reply-link after inspecting https://directory.singaporefintech.org/hello-world/?category=0&zoom=15&is_mile=0&directory_radius=0&view=list&hide_searchbox=0&hide_nav=0&hide_nav_views=0&hide_pager=0&featured_only=0&feature=1&perpage=20&sort=random and got results when i added print statements

You can scrape links using a CSS selector. The selector div.sabai-directory-title a will find any <a> tags inside a <div> tag with class sabai-directory-title (I updated the URL, yours was giving me error pages):
from bs4 import BeautifulSoup
import requests
from pprint import pprint
r = requests.get('https://directory.singaporefintech.org/')
soup = BeautifulSoup(r.text, 'lxml')
hrefs = [a['href'] for a in soup.select('div.sabai-directory-title a')]
pprint(hrefs)
This will print:
['https://directory.singaporefintech.org/directory/listing/silent-eight',
'https://directory.singaporefintech.org/directory/listing/incomlend',
'https://directory.singaporefintech.org/directory/listing/bizgrow',
'https://directory.singaporefintech.org/directory/listing/makerscut',
'https://directory.singaporefintech.org/directory/listing/soho-fintech',
'https://directory.singaporefintech.org/directory/listing/dxmarkets',
'https://directory.singaporefintech.org/directory/listing/fundrevo',
'https://directory.singaporefintech.org/directory/listing/money4money',
'https://directory.singaporefintech.org/directory/listing/onelyst',
'https://directory.singaporefintech.org/directory/listing/hearti-lab',
'https://directory.singaporefintech.org/directory/listing/samurai-fintech-singapore-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/ceo-1',
'https://directory.singaporefintech.org/directory/listing/arcadier',
'https://directory.singaporefintech.org/directory/listing/plmp-fintech-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/cash-in-asia',
'https://directory.singaporefintech.org/directory/listing/grc-systems',
'https://directory.singaporefintech.org/directory/listing/sendexpense',
'https://directory.singaporefintech.org/directory/listing/jinjerjade',
'https://directory.singaporefintech.org/directory/listing/hatcher',
'https://directory.singaporefintech.org/directory/listing/fintech-consortium']

Hi I have made few changes to code:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
# NOTE: the same address is queued four times (the loop index was never used
# in the URL), so every link is collected four times over.
urls = ["https://directory.singaporefintech.org"] * 4
Data = []
hrefs = []
for info in urls:
    page = requests.get(info)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Title blocks are <div class="sabai-directory-title">; keep the href of
    # every anchor inside them that has an href and non-empty text.
    for title_div in soup.find_all('div', attrs={'class': 'sabai-directory-title'}):
        for a in title_div.find_all('a', href=True):
            if a.text:
                Data.append(a['href'].encode('ascii'))
pprint(Data)
output:
['https://directory.singaporefintech.org/directory/listing/silent-eight',
'https://directory.singaporefintech.org/directory/listing/moolahsense',
'https://directory.singaporefintech.org/directory/listing/myfinb',
'https://directory.singaporefintech.org/directory/listing/wefinance',
'https://directory.singaporefintech.org/directory/listing/quber',
'https://directory.singaporefintech.org/directory/listing/ayondo-asia-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/ceo-1',
'https://directory.singaporefintech.org/directory/listing/acekards',
'https://directory.singaporefintech.org/directory/listing/paper-ink-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/alpha-payments-cloud',
'https://directory.singaporefintech.org/directory/listing/samurai-fintech-singapore-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/corris-asset-management-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/fundmylife',
'https://directory.singaporefintech.org/directory/listing/mooments',
'https://directory.singaporefintech.org/directory/listing/venture-capital-network-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/junotele_',
'https://directory.singaporefintech.org/directory/listing/mobilecover',
'https://directory.singaporefintech.org/directory/listing/cherrypay',
'https://directory.singaporefintech.org/directory/listing/toast',
'https://directory.singaporefintech.org/directory/listing/cashdab',
'https://directory.singaporefintech.org/directory/listing/silent-eight',
'https://directory.singaporefintech.org/directory/listing/moolahsense',
'https://directory.singaporefintech.org/directory/listing/myfinb',
'https://directory.singaporefintech.org/directory/listing/wefinance',
'https://directory.singaporefintech.org/directory/listing/quber',
'https://directory.singaporefintech.org/directory/listing/ayondo-asia-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/ceo-1',
'https://directory.singaporefintech.org/directory/listing/acekards',
'https://directory.singaporefintech.org/directory/listing/paper-ink-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/alpha-payments-cloud',
'https://directory.singaporefintech.org/directory/listing/samurai-fintech-singapore-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/corris-asset-management-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/fundmylife',
'https://directory.singaporefintech.org/directory/listing/mooments',
'https://directory.singaporefintech.org/directory/listing/venture-capital-network-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/junotele_',
'https://directory.singaporefintech.org/directory/listing/mobilecover',
'https://directory.singaporefintech.org/directory/listing/cherrypay',
'https://directory.singaporefintech.org/directory/listing/toast',
'https://directory.singaporefintech.org/directory/listing/cashdab',
'https://directory.singaporefintech.org/directory/listing/silent-eight',
'https://directory.singaporefintech.org/directory/listing/moolahsense',
'https://directory.singaporefintech.org/directory/listing/myfinb',
'https://directory.singaporefintech.org/directory/listing/wefinance',
'https://directory.singaporefintech.org/directory/listing/quber',
'https://directory.singaporefintech.org/directory/listing/ayondo-asia-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/ceo-1',
'https://directory.singaporefintech.org/directory/listing/acekards',
'https://directory.singaporefintech.org/directory/listing/paper-ink-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/alpha-payments-cloud',
'https://directory.singaporefintech.org/directory/listing/samurai-fintech-singapore-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/corris-asset-management-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/fundmylife',
'https://directory.singaporefintech.org/directory/listing/mooments',
'https://directory.singaporefintech.org/directory/listing/venture-capital-network-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/junotele_',
'https://directory.singaporefintech.org/directory/listing/mobilecover',
'https://directory.singaporefintech.org/directory/listing/cherrypay',
'https://directory.singaporefintech.org/directory/listing/toast',
'https://directory.singaporefintech.org/directory/listing/cashdab',
'https://directory.singaporefintech.org/directory/listing/silent-eight',
'https://directory.singaporefintech.org/directory/listing/moolahsense',
'https://directory.singaporefintech.org/directory/listing/myfinb',
'https://directory.singaporefintech.org/directory/listing/wefinance',
'https://directory.singaporefintech.org/directory/listing/quber',
'https://directory.singaporefintech.org/directory/listing/ayondo-asia-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/ceo-1',
'https://directory.singaporefintech.org/directory/listing/acekards',
'https://directory.singaporefintech.org/directory/listing/paper-ink-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/alpha-payments-cloud',
'https://directory.singaporefintech.org/directory/listing/samurai-fintech-singapore-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/corris-asset-management-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/fundmylife',
'https://directory.singaporefintech.org/directory/listing/mooments',
'https://directory.singaporefintech.org/directory/listing/venture-capital-network-pte-ltd',
'https://directory.singaporefintech.org/directory/listing/junotele_',
'https://directory.singaporefintech.org/directory/listing/mobilecover',
'https://directory.singaporefintech.org/directory/listing/cherrypay',
'https://directory.singaporefintech.org/directory/listing/toast',
'https://directory.singaporefintech.org/directory/listing/cashdab']
Is this the data output you are expecting?
Hope it helps!!

Get links from a site's homepage using python

I want to write a script to get a home page's links to social media (twitter / facebook mostly), and I'm completely stuck since I am fairly new to Python.
The task I want to accomplish is to parse the website, find the social media links, and save it in a new data frame where each column would contain the original URL, the twitter link, and the facebook link. Here's what I have so far of this code for the new york times website:
from bs4 import BeautifulSoup
import requests
url = "http://www.nytimes.com"
r = requests.get(url)
sm_sites = ['twitter.com','facebook.com']
soup = BeautifulSoup(r.content, 'html5lib')
all_links = soup.find_all('a', href = True)
for site in sm_sites:
    # NOTE: the test checks `site` against sm_sites itself and never looks at
    # `link`, so it is true for every site regardless of the page's links.
    found = all(site in sm_sites for link in all_links)
    print(site if found else 'no link')
I'm having some problems understanding what the loop is doing, or how to make it work for what I need it to. I also had tried to store the site instead of doing print(site) but that was not working... So I figured I'd ask for help. Before asking, I went through a bunch of responses here but none could get me to do what I needed to do.

the way this code works, you already have your links. Your homepage link is the starting url, so http://www.nytimes.com.
And you have the social media urls sm_sites = ['twitter.com','facebook.com'], all you're doing is confirming they exist on the main page. If you want to save the list of confirmed social media urls, then append them to a list
Here is one way to get the social media links off a page
import requests
from bs4 import BeautifulSoup
url = "https://stackoverflow.com/questions/tagged/python"
r = requests.get(url)
sm_sites = ['twitter.com','facebook.com']
soup = BeautifulSoup(r.content, 'html5lib')
all_links = soup.find_all('a', href = True)
# Keep every href that contains one of the social-media domains, grouped by
# domain in the order the domains are listed.
sm_sites_present = [
    link.attrs['href']
    for sm_site in sm_sites
    for link in all_links
    if sm_site in link.attrs['href']
]
print(sm_sites_present)
output:
['https://twitter.com/stackoverflow', 'https://www.facebook.com/officialstackoverflow/']
Update
for a df of urls
import requests
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display
urls = [
"https://stackoverflow.com/questions/tagged/python",
"https://www.nytimes.com/",
"https://en.wikipedia.org/"
]
sm_sites = ['twitter.com','facebook.com']
sm_sites_present = []
columns = ['url'] + sm_sites
df = pd.DataFrame(data={'url' : urls}, columns=columns)
def get_sm(row):
    """Find social-media links on the page named by ``row['url']``.

    Args:
        row: A mapping (here a DataFrame row) with a 'url' entry.

    Returns:
        A Series indexed by those entries of ``sm_sites`` for which a link
        was found. When several links match one site, the last one wins.
    """
    r = requests.get(row['url'])
    # An explicit dtype avoids the pandas warning raised for an empty Series
    # created without one; object is the right dtype for URL strings.
    output = pd.Series(dtype=object)
    soup = BeautifulSoup(r.content, 'html5lib')
    all_links = soup.find_all('a', href = True)
    for sm_site in sm_sites:
        for link in all_links:
            if sm_site in link.attrs['href']:
                output[sm_site] = link.attrs['href']
    return output
sm_columns = df.apply(get_sm, axis=1)
df.update(sm_columns)
df.fillna(value='no link')
output

This will do what you want with regards to adding it to a DataFrame. You can iterate through a list of websites (urlsToSearch), adding a row to the dataframe for each one containing the base website, all facebook links, and all twitter links.
from bs4 import BeautifulSoup
import requests
import pandas as pd
df = pd.DataFrame(columns=["Website", "Facebook", "Twitter"])
urlsToSearch = ["http://www.nytimes.com","http://www.businessinsider.com/"]
for url in urlsToSearch:
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Only the href strings are needed from here on.
    all_links = [anchor['href'] for anchor in soup.find_all('a', href = True)]
    tw_links = [href for href in all_links if "twitter.com" in href]
    # A link already counted as Twitter is not counted again as Facebook.
    fb_links = [href for href in all_links
                if "twitter.com" not in href and "facebook.com" in href]
    # Append one row: the site, then its Facebook links, then its Twitter links.
    df.loc[len(df)] = [url, fb_links, tw_links]

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Web crawler - following links - python

Related

Pulling p tags from multiple URLs

Iterating over urls fails to find correct href in Python using BeautifulSoup

How can I get the correct urls of ads?

Web Scraping through Python BeautifulSoup

Get links from a site's homepage using python

Categories

Resources