We need to get the images that sit behind the links in a table. The page contains a table, each row contains a link, and the linked page contains the images. What is the best way to get all the images of the website?
So far we can extract the text from the table, and the links:
But how do we get the images of this site?
# Download and parse the overview page once; main() below reads `soup`.
import urllib2  # was missing: urllib2.urlopen is called below (Python 2 stdlib)

from bs4 import BeautifulSoup

url = "http://www.gestolenkunst.nl/gestolen%20overzicht.htm" # change to whatever your url is
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, "html.parser")
link = soup.a  # first <a> tag in the parsed document
def main():
    """Print the table rows of the overview page, then every link on it.

    Reads the module-level ``soup``. The first two ``<tr>`` elements are
    skipped (presumably header rows -- confirm against the page). For each
    remaining row the first three cells are printed, labelled ``Datum``,
    ``Plaats`` and ``Gestolen van``. Afterwards the ``href`` and the inner
    markup of every anchor that has an ``href`` are printed.
    """
    for tr in soup.find_all('tr')[2:]:
        tds = tr.find_all('td')
        if len(tds) < 3:
            # Rows with fewer than three cells would raise IndexError below.
            continue
        print("Datum: %s\n Plaats: %s\n Gestolen van: %s\n" %
              (tds[0].text.strip(), tds[1].text.strip(), tds[2].text.strip()))

    # href=True skips anchors without an href, which would raise KeyError.
    # NOTE(review): the source indentation was lost; this loop is assumed to
    # run once after the table loop rather than once per row.
    for anchor in soup.find_all('a', href=True):
        print(anchor["href"])
        print(anchor.renderContents())
If you want all the links with href's that include images/..., you can use a select using a css-selector:
# Download and parse the page at ``url`` (``url`` must already be defined).
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, "html.parser")
# Print the href of every anchor whose href starts with "images/".
# The attribute value is quoted: "images/" is not a valid bare CSS identifier,
# and Soup Sieve-based Beautiful Soup releases reject the unquoted form.
print([a["href"] for a in soup.select('a[href^="images/"]')])
Which will give you:
[u'images/2015/eindhoven/eindhoven.htm', u'images/2015/rtl4/rtl4_schilderij%20gezocht.htm', u'images/2015/eindhoven/eindhoven.htm', u'images/2015/rtl4/rtl4_schilderij%20gezocht.htm', u'images/2015/doetinchem/doetinchem.htm', u'images/2014/emmeloord/emmeloord.htm', u'images/2014/heelsum/heelsum.htm', u'images/2014/blaricum/blaricum.htm', u'images/2014/hilversum/hilversum.htm', u'images/2014/kerkrade/kerkrade.htm', u'images/2013/heerhugowaard/artfarm.htm', u'images/2013/sittard/sittard.htm', u'images/2013/rotterdam/rotterdan.htm', u'images/2013/epe/epe.htm', u'images/2013/beek/beek.htm', u'images/2012/utrecht/utrecht.htm', u'images/2012/amsterdam/amsterdam.htm', u'images/2012/zwijndrecht/zwijdrecht.htm', u'images/2012/de_bilt/bakker.htm', u'images/2012/zutphen/zutphen.htm', u'images/2012/rheden/carmiggelt.htm', u'images/2012/dieren/dieren.htm', u'images/2011/denhaag/denhaag.htm', u'images/2011/rotterdam/bronzenbeeld_rotterdam.htm', u'images/2011/utrecht/utrecht.htm', u'images/2012/denhaag/denhaag.htm', u'images/2011/oosterbeek/oosterbeek.htm', u'images/2011/teruggevonden_alkmaar/sumo_worstelaar_brons.htm', u'images/2011/teruggevonden_alkmaar/sumo_worstelaar_brons.htm', u'images/2011/vlierden/vlierden.htm', u'images/2011/bergen/bergen.htm', u'images/2011/alkmaar/mca.htm', u'images/2010/hendrik_ido_ambacht/hendrik-ido-ambacht.htm', u'images/2010/nijmegen/nijmegen.htm', u'images/2010/heesch', u'images/2010/boxie/boxie_gestolen_powerbook.htm', u'images/2010/ijmuiden/ijmuiden_nic_jonk.htm', u'images/2010/jitsbakker/bakkerjuli2010.htm', u'images/2010/gouda/gouda.htm', u'images/2010/enschede/enschede.htm', u'images/2010/someren/someren.htm', u'images/2010/jitsbakker/jitsbakker_steltloper_maart2010.htm', u'images/2009/hogeweg/hogeweg.htm', u'images/2009/wildevuur/wildevuur.htm', u'images/2009/bad-nieuweschans/heus.htm', u'images/2009/bad-nieuweschans/heus.htm', u'images/2009/darp/kneulman.htm', u'images/2009/keramikos/keramikos.htm', u'images/2009/maasbracht/maasbracht.htm', 
u'images/2008/groet/groet.htm', u'images/2009/rotterdam/rotterdam.htm', u'images/2009/rotterdam/rotterdam.htm', u'images/2008/swifterband/swifterband.htm', u'images/2008/laren_snoek/laren_snoek.htm', u'images/2008/beetsterzwaag/Beetsterzwaag.htm', u'images/2008/callantsoog/booghaard.htm', u'images/2008/lelystad/lelystad.htm', u'images/2008/amsterdam_kok/amsterdam_kok.htm', u'images/2008/lochem/lochem.htm', u'images/2008/liempde/liempde.htm', u'images/2008/heerhugowaard/zande.htm', u'images/2008/amsterdam_hatterman/amsterdam_hatterman.htm', u'images/2008/delft/delft.htm', u'images/2008/sgraveland/sgraveland.htm', u'images/2008/laren/laren110308.htm', u'images/2008/laren/laren110308.htm', u'images/2008/alphen_ad_rijn/alphen_ad_rijn.htm', u'images/2008/hardinxveld/hardinxveld.htm', u'images/2008/denhaag_karres/denhaag_karres.htm', u'images/2008/amsterdam_eijsenberger/amsterdam_eijs.htm', u'images/2008/amsterdam/amsterdam.htm', u'images/2008/denhaag/denhaag_brauw.htm', u'images/2008/groenhart/groenhart.htm', u'images/2007/aalsmeer/aalsmeer.htm', u'images/2007/delft/delft_klaus.htm', u'images/2007/malden/malden.htm', u'images/2007/sterksel/sterksel.htm', u'images/2007/zeist/zeist_achmea.htm', u'images/2007/maaseik_laar/maaseik.htm', u'images/2007/meerssen/meerssen.htm', u'images/2007/lisse/lisse.htm', u'images/2007/kortenhoef/kortenhoef.htm', u'images/2007/schijndel/schijndel.htm', u'images/2007/alkmaar/smit.htm', u'images/2007/heerlen/heerlen.htm', u'images/2007/heerlen/heerlen.htm', u'images/2007/tiel/kaayk.htm', u'images/2007/arnhem/arnhem.htm', u'images/2007/amsterdam_noort/amsterdam_noort.htm', u'images/2007/sgravenhage/sgravenhage.htm', u'images/2007/hazelaar/hazelaar.htm', u'images/2007/putte-stabroek/putte_stabroek.htm', u'images/2007/maarssen/maarssen_beeldentuin.htm', u'images/2007/huizen/huizen_gemeente.htm', u'images/2007/Maastricht_laar/maastricht_laar.htm', u'images/2007/bilthoven/bilthoven_v.htm', u'images/2007/sypesteyn/sypesteyn.htm', 
u'images/2007/hulzen/hulzen.htm', u'images/2007/huizen_limieten/huizen_limieten.htm', u'images/2007/elburg/elburg_galerie.htm', u'images/2007/hasselt/schildwacht_hasselt.htm', u'images/2006/woerden/woerden.htm', u'images/2006/amsterdam_slotervaart/sheils.htm', u'images/2006/recr_klepperstee_ouddorp/klepperstee_ouddorp.htm', u'images/2006/recr_klepperstee_ouddorp/klepperstee_ouddorp.htm', u'images/2007/erichem/krol.htm', u'images/2006/someren/someren.htm', u'images/2006/sliedrecht/sliedrecht_jitsbakker.htm', u'images/2006/blank/blank_2006.htm', u'images/2006/kemps_eindhoven/kemps_eindhoven.htm', u'images/2006/schoorl/begraafplaats_diels.htm', u'images/2006/bloemendaal/bloemendaal.htm', u'images/2010/zwolle/zwolle_leeser.htm', u'images/2006/vinkeveen/vinkeveen_jozefschool.htm', u'images/2006/gemeente_bergen/gemeente_bergen.htm', u'images/2006/alphen_ad_rijn/alphenadrijn.htm', u'images/2006/alphen_ad_rijn/alphenadrijn.htm', u'images/2006/Nieuwegein/nieuwegein.htm', u'images/2006/alkmaar_lange/lange.htm', u'images/2006/janneman/janneman.htm', u'images/2006/schoffelen/schoffelen.htm', u'images/2006/keuenhof_ede/keuenhof.htm', u'images/2006/rucphen/rucphen.htm', u'images/2006/lemberger_amsterdam/lemberger.htm', u'images/2006/bronckhorst/bronckhorst.htm', u'images/2006/arnhem_peja/peja.htm', u'images/2005/klomp/klomp.htm', u'images/2005/kalverda/kalverda.htm', u'images/2005/schellekens/schellekens.htm', u'images/2005/beeldhouwwinkel/beeldhouwwinkel.htm', u'images/2005/huisman/huisman.htm', u'images/2005/lith/lith.htm', u'images/2005/bergen/bergen_onna.htm', u'images/2005/emst_remeeus/emst.htm', u'images/2005/water/water.htm', u'images/2005/maastricht_drielsma/maastricht_drielsma.htm', u'images/2005/bosch/bosch.htm', u'images/2005/fransen/fransen.htm', u'images/vaart/vaart.htm', u'images/2005/lammertink/lammertink.htm', u'images/brocke/brocke.htm', u'images/wood/wood.htm', u'images/2005/klijzing/klijzing.htm', u'images/metalart/ponsioen.htm', 
u'images/harderwijk/harderwijk.htm', u'images/gulpen/gulpen.htm', u'images/limburg/limburg.htm', u'images/landgraaf/landgraaf.htm', u'images/pijnappel.htm', u'images/termaat/termaat.htm', u'images/vries/vries.htm', u'images/hartigh/hartigh.htm', u'images/hengelo/hengelo.htm', u'images/nijmegen/teunen/teunen.htm', u'images/nijmegen/nijmegen.htm', u'images/hollants/hollants_carla.htm', u'images/laren/laren_gils.htm', u'images/2003/smakt/smakt.htm', u'images/koopman/koopman.htm', u'images/voorschoten/voorschoten.htm', u'images/hagen/hagen.htm', u'images/bakker/gestolen%20kunst%20Jits%20Bakker.htm', u'images/bliekendaal/bliekendaal.htm', u'images/2003/utrecht/utrecht.htm', u'images/davina/davina.htm', u'images/janneman/janneman.htm', u'images/dijk/dijk.htm', u'images/clarissenbolwerk/havermans.htm', u'images/appelhof/appelhof.htm', u'images/blank/blank.htm', u'images/dussen/dussen.htm', u'images/bakker/gestolen%20kunst%20Jits%20Bakker.htm', u'images/rijs/rijs.htm', u'images/janssen', u'images/bakker/gestolen%20kunst%20Jits%20Bakker.htm', u'images/2002/nijssen/ouderamstel_nijssen.htm', u'images/onna/onna.htm', u'images/haring/haring.htm', u'images/dijk/dijk.htm', u'images/janneman/janneman.htm', u'images/hessels/hessels.htm', u'images/onna/onna.htm', u'images/2012/culemborg/culemborg_1998.htm', u'images/1998/mierlo/mierlo.htm', u'images/oud/wigbold/wigbold.htm', u'images/oud/nijmegen/nijmegen.htm', u'images/oud/amsterdam_ommen/amstedam_ommen.htm']
a[href^=images/] simply means: find all the anchor tags that have an href attribute whose value starts with images/. Then we just pull the href from each anchor.
To use the links you will need to join the link to the main url:
import urllib2  # was missing: urllib2.urlopen is called below (Python 2 stdlib)
from urlparse import urljoin

from bs4 import BeautifulSoup

url = "http://www.gestolenkunst.nl/gestolen%20overzicht.htm" # change to whatever your url is
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, "html.parser")
base = "http://www.gestolenkunst.nl/"
# Resolve every relative "images/..." href against ``base`` to get an
# absolute URL. The attribute value is quoted because "images/" is not a
# valid bare CSS identifier and newer Beautiful Soup releases reject it.
images = [urljoin(base, a["href"]) for a in soup.select('a[href^="images/"]')]
print(images)
Output:
[u'http://www.gestolenkunst.nl/images/2015/eindhoven/eindhoven.htm', u'http://www.gestolenkunst.nl/images/2015/rtl4/rtl4_schilderij%20gezocht.htm', u'http://www.gestolenkunst.nl/images/2015/eindhoven/eindhoven.htm', u'http://www.gestolenkunst.nl/images/2015/rtl4/rtl4_schilderij%20gezocht.htm', u'http://www.gestolenkunst.nl/images/2015/doetinchem/doetinchem.htm', u'http://www.gestolenkunst.nl/images/2014/emmeloord/emmeloord.htm', u'http://www.gestolenkunst.nl/images/2014/heelsum/heelsum.htm', u'http://www.gestolenkunst.nl/images/2014/blaricum/blaricum.htm', u'http://www.gestolenkunst.nl/images/2014/hilversum/hilversum.htm', u'http://www.gestolenkunst.nl/images/2014/kerkrade/kerkrade.htm', u'http://www.gestolenkunst.nl/images/2013/heerhugowaard/artfarm.htm', u'http://www.gestolenkunst.nl/images/2013/sittard/sittard.htm', u'http://www.gestolenkunst.nl/images/2013/rotterdam/rotterdan.htm', u'http://www.gestolenkunst.nl/images/2013/epe/epe.htm', u'http://www.gestolenkunst.nl/images/2013/beek/beek.htm', u'http://www.gestolenkunst.nl/images/2012/utrecht/utrecht.htm', u'http://www.gestolenkunst.nl/images/2012/amsterdam/amsterdam.htm', u'http://www.gestolenkunst.nl/images/2012/zwijndrecht/zwijdrecht.htm', u'http://www.gestolenkunst.nl/images/2012/de_bilt/bakker.htm', u'http://www.gestolenkunst.nl/images/2012/zutphen/zutphen.htm', u'http://www.gestolenkunst.nl/images/2012/rheden/carmiggelt.htm', u'http://www.gestolenkunst.nl/images/2012/dieren/dieren.htm', u'http://www.gestolenkunst.nl/images/2011/denhaag/denhaag.htm', u'http://www.gestolenkunst.nl/images/2011/rotterdam/bronzenbeeld_rotterdam.htm', u'http://www.gestolenkunst.nl/images/2011/utrecht/utrecht.htm', u'http://www.gestolenkunst.nl/images/2012/denhaag/denhaag.htm', u'http://www.gestolenkunst.nl/images/2011/oosterbeek/oosterbeek.htm', u'http://www.gestolenkunst.nl/images/2011/teruggevonden_alkmaar/sumo_worstelaar_brons.htm', u'http://www.gestolenkunst.nl/images/2011/teruggevonden_alkmaar/sumo_worstelaar_brons.htm', 
u'http://www.gestolenkunst.nl/images/2011/vlierden/vlierden.htm', u'http://www.gestolenkunst.nl/images/2011/bergen/bergen.htm', u'http://www.gestolenkunst.nl/images/2011/alkmaar/mca.htm', u'http://www.gestolenkunst.nl/images/2010/hendrik_ido_ambacht/hendrik-ido-ambacht.htm', u'http://www.gestolenkunst.nl/images/2010/nijmegen/nijmegen.htm', u'http://www.gestolenkunst.nl/images/2010/heesch', u'http://www.gestolenkunst.nl/images/2010/boxie/boxie_gestolen_powerbook.htm', u'http://www.gestolenkunst.nl/images/2010/ijmuiden/ijmuiden_nic_jonk.htm', u'http://www.gestolenkunst.nl/images/2010/jitsbakker/bakkerjuli2010.htm', u'http://www.gestolenkunst.nl/images/2010/gouda/gouda.htm', u'http://www.gestolenkunst.nl/images/2010/enschede/enschede.htm', u'http://www.gestolenkunst.nl/images/2010/someren/someren.htm', u'http://www.gestolenkunst.nl/images/2010/jitsbakker/jitsbakker_steltloper_maart2010.htm', u'http://www.gestolenkunst.nl/images/2009/hogeweg/hogeweg.htm', u'http://www.gestolenkunst.nl/images/2009/wildevuur/wildevuur.htm', u'http://www.gestolenkunst.nl/images/2009/bad-nieuweschans/heus.htm', u'http://www.gestolenkunst.nl/images/2009/bad-nieuweschans/heus.htm', u'http://www.gestolenkunst.nl/images/2009/darp/kneulman.htm', u'http://www.gestolenkunst.nl/images/2009/keramikos/keramikos.htm', u'http://www.gestolenkunst.nl/images/2009/maasbracht/maasbracht.htm', u'http://www.gestolenkunst.nl/images/2008/groet/groet.htm', u'http://www.gestolenkunst.nl/images/2009/rotterdam/rotterdam.htm', u'http://www.gestolenkunst.nl/images/2009/rotterdam/rotterdam.htm', u'http://www.gestolenkunst.nl/images/2008/swifterband/swifterband.htm', u'http://www.gestolenkunst.nl/images/2008/laren_snoek/laren_snoek.htm', u'http://www.gestolenkunst.nl/images/2008/beetsterzwaag/Beetsterzwaag.htm', u'http://www.gestolenkunst.nl/images/2008/callantsoog/booghaard.htm', u'http://www.gestolenkunst.nl/images/2008/lelystad/lelystad.htm', 
u'http://www.gestolenkunst.nl/images/2008/amsterdam_kok/amsterdam_kok.htm', u'http://www.gestolenkunst.nl/images/2008/lochem/lochem.htm', u'http://www.gestolenkunst.nl/images/2008/liempde/liempde.htm', u'http://www.gestolenkunst.nl/images/2008/heerhugowaard/zande.htm', u'http://www.gestolenkunst.nl/images/2008/amsterdam_hatterman/amsterdam_hatterman.htm', u'http://www.gestolenkunst.nl/images/2008/delft/delft.htm', u'http://www.gestolenkunst.nl/images/2008/sgraveland/sgraveland.htm', u'http://www.gestolenkunst.nl/images/2008/laren/laren110308.htm', u'http://www.gestolenkunst.nl/images/2008/laren/laren110308.htm', u'http://www.gestolenkunst.nl/images/2008/alphen_ad_rijn/alphen_ad_rijn.htm', u'http://www.gestolenkunst.nl/images/2008/hardinxveld/hardinxveld.htm', u'http://www.gestolenkunst.nl/images/2008/denhaag_karres/denhaag_karres.htm', u'http://www.gestolenkunst.nl/images/2008/amsterdam_eijsenberger/amsterdam_eijs.htm', u'http://www.gestolenkunst.nl/images/2008/amsterdam/amsterdam.htm', u'http://www.gestolenkunst.nl/images/2008/denhaag/denhaag_brauw.htm', u'http://www.gestolenkunst.nl/images/2008/groenhart/groenhart.htm', u'http://www.gestolenkunst.nl/images/2007/aalsmeer/aalsmeer.htm', u'http://www.gestolenkunst.nl/images/2007/delft/delft_klaus.htm', u'http://www.gestolenkunst.nl/images/2007/malden/malden.htm', u'http://www.gestolenkunst.nl/images/2007/sterksel/sterksel.htm', u'http://www.gestolenkunst.nl/images/2007/zeist/zeist_achmea.htm', u'http://www.gestolenkunst.nl/images/2007/maaseik_laar/maaseik.htm', u'http://www.gestolenkunst.nl/images/2007/meerssen/meerssen.htm', u'http://www.gestolenkunst.nl/images/2007/lisse/lisse.htm', u'http://www.gestolenkunst.nl/images/2007/kortenhoef/kortenhoef.htm', u'http://www.gestolenkunst.nl/images/2007/schijndel/schijndel.htm', u'http://www.gestolenkunst.nl/images/2007/alkmaar/smit.htm', u'http://www.gestolenkunst.nl/images/2007/heerlen/heerlen.htm', u'http://www.gestolenkunst.nl/images/2007/heerlen/heerlen.htm', 
u'http://www.gestolenkunst.nl/images/2007/tiel/kaayk.htm', u'http://www.gestolenkunst.nl/images/2007/arnhem/arnhem.htm', u'http://www.gestolenkunst.nl/images/2007/amsterdam_noort/amsterdam_noort.htm', u'http://www.gestolenkunst.nl/images/2007/sgravenhage/sgravenhage.htm', u'http://www.gestolenkunst.nl/images/2007/hazelaar/hazelaar.htm', u'http://www.gestolenkunst.nl/images/2007/putte-stabroek/putte_stabroek.htm', u'http://www.gestolenkunst.nl/images/2007/maarssen/maarssen_beeldentuin.htm', u'http://www.gestolenkunst.nl/images/2007/huizen/huizen_gemeente.htm', u'http://www.gestolenkunst.nl/images/2007/Maastricht_laar/maastricht_laar.htm', u'http://www.gestolenkunst.nl/images/2007/bilthoven/bilthoven_v.htm', u'http://www.gestolenkunst.nl/images/2007/sypesteyn/sypesteyn.htm', u'http://www.gestolenkunst.nl/images/2007/hulzen/hulzen.htm', u'http://www.gestolenkunst.nl/images/2007/huizen_limieten/huizen_limieten.htm', u'http://www.gestolenkunst.nl/images/2007/elburg/elburg_galerie.htm', u'http://www.gestolenkunst.nl/images/2007/hasselt/schildwacht_hasselt.htm', u'http://www.gestolenkunst.nl/images/2006/woerden/woerden.htm', u'http://www.gestolenkunst.nl/images/2006/amsterdam_slotervaart/sheils.htm', u'http://www.gestolenkunst.nl/images/2006/recr_klepperstee_ouddorp/klepperstee_ouddorp.htm', u'http://www.gestolenkunst.nl/images/2006/recr_klepperstee_ouddorp/klepperstee_ouddorp.htm', u'http://www.gestolenkunst.nl/images/2007/erichem/krol.htm', u'http://www.gestolenkunst.nl/images/2006/someren/someren.htm', u'http://www.gestolenkunst.nl/images/2006/sliedrecht/sliedrecht_jitsbakker.htm', u'http://www.gestolenkunst.nl/images/2006/blank/blank_2006.htm', u'http://www.gestolenkunst.nl/images/2006/kemps_eindhoven/kemps_eindhoven.htm', u'http://www.gestolenkunst.nl/images/2006/schoorl/begraafplaats_diels.htm', u'http://www.gestolenkunst.nl/images/2006/bloemendaal/bloemendaal.htm', u'http://www.gestolenkunst.nl/images/2010/zwolle/zwolle_leeser.htm', 
u'http://www.gestolenkunst.nl/images/2006/vinkeveen/vinkeveen_jozefschool.htm', u'http://www.gestolenkunst.nl/images/2006/gemeente_bergen/gemeente_bergen.htm', u'http://www.gestolenkunst.nl/images/2006/alphen_ad_rijn/alphenadrijn.htm', u'http://www.gestolenkunst.nl/images/2006/alphen_ad_rijn/alphenadrijn.htm', u'http://www.gestolenkunst.nl/images/2006/Nieuwegein/nieuwegein.htm', u'http://www.gestolenkunst.nl/images/2006/alkmaar_lange/lange.htm', u'http://www.gestolenkunst.nl/images/2006/janneman/janneman.htm', u'http://www.gestolenkunst.nl/images/2006/schoffelen/schoffelen.htm', u'http://www.gestolenkunst.nl/images/2006/keuenhof_ede/keuenhof.htm', u'http://www.gestolenkunst.nl/images/2006/rucphen/rucphen.htm', u'http://www.gestolenkunst.nl/images/2006/lemberger_amsterdam/lemberger.htm', u'http://www.gestolenkunst.nl/images/2006/bronckhorst/bronckhorst.htm', u'http://www.gestolenkunst.nl/images/2006/arnhem_peja/peja.htm', u'http://www.gestolenkunst.nl/images/2005/klomp/klomp.htm', u'http://www.gestolenkunst.nl/images/2005/kalverda/kalverda.htm', u'http://www.gestolenkunst.nl/images/2005/schellekens/schellekens.htm', u'http://www.gestolenkunst.nl/images/2005/beeldhouwwinkel/beeldhouwwinkel.htm', u'http://www.gestolenkunst.nl/images/2005/huisman/huisman.htm', u'http://www.gestolenkunst.nl/images/2005/lith/lith.htm', u'http://www.gestolenkunst.nl/images/2005/bergen/bergen_onna.htm', u'http://www.gestolenkunst.nl/images/2005/emst_remeeus/emst.htm', u'http://www.gestolenkunst.nl/images/2005/water/water.htm', u'http://www.gestolenkunst.nl/images/2005/maastricht_drielsma/maastricht_drielsma.htm', u'http://www.gestolenkunst.nl/images/2005/bosch/bosch.htm', u'http://www.gestolenkunst.nl/images/2005/fransen/fransen.htm', u'http://www.gestolenkunst.nl/images/vaart/vaart.htm', u'http://www.gestolenkunst.nl/images/2005/lammertink/lammertink.htm', u'http://www.gestolenkunst.nl/images/brocke/brocke.htm', u'http://www.gestolenkunst.nl/images/wood/wood.htm', 
u'http://www.gestolenkunst.nl/images/2005/klijzing/klijzing.htm', u'http://www.gestolenkunst.nl/images/metalart/ponsioen.htm', u'http://www.gestolenkunst.nl/images/harderwijk/harderwijk.htm', u'http://www.gestolenkunst.nl/images/gulpen/gulpen.htm', u'http://www.gestolenkunst.nl/images/limburg/limburg.htm', u'http://www.gestolenkunst.nl/images/landgraaf/landgraaf.htm', u'http://www.gestolenkunst.nl/images/pijnappel.htm', u'http://www.gestolenkunst.nl/images/termaat/termaat.htm', u'http://www.gestolenkunst.nl/images/vries/vries.htm', u'http://www.gestolenkunst.nl/images/hartigh/hartigh.htm', u'http://www.gestolenkunst.nl/images/hengelo/hengelo.htm', u'http://www.gestolenkunst.nl/images/nijmegen/teunen/teunen.htm', u'http://www.gestolenkunst.nl/images/nijmegen/nijmegen.htm', u'http://www.gestolenkunst.nl/images/hollants/hollants_carla.htm', u'http://www.gestolenkunst.nl/images/laren/laren_gils.htm', u'http://www.gestolenkunst.nl/images/2003/smakt/smakt.htm', u'http://www.gestolenkunst.nl/images/koopman/koopman.htm', u'http://www.gestolenkunst.nl/images/voorschoten/voorschoten.htm', u'http://www.gestolenkunst.nl/images/hagen/hagen.htm', u'http://www.gestolenkunst.nl/images/bakker/gestolen%20kunst%20Jits%20Bakker.htm', u'http://www.gestolenkunst.nl/images/bliekendaal/bliekendaal.htm', u'http://www.gestolenkunst.nl/images/2003/utrecht/utrecht.htm', u'http://www.gestolenkunst.nl/images/davina/davina.htm', u'http://www.gestolenkunst.nl/images/janneman/janneman.htm', u'http://www.gestolenkunst.nl/images/dijk/dijk.htm', u'http://www.gestolenkunst.nl/images/clarissenbolwerk/havermans.htm', u'http://www.gestolenkunst.nl/images/appelhof/appelhof.htm', u'http://www.gestolenkunst.nl/images/blank/blank.htm', u'http://www.gestolenkunst.nl/images/dussen/dussen.htm', u'http://www.gestolenkunst.nl/images/bakker/gestolen%20kunst%20Jits%20Bakker.htm', u'http://www.gestolenkunst.nl/images/rijs/rijs.htm', u'http://www.gestolenkunst.nl/images/janssen', 
u'http://www.gestolenkunst.nl/images/bakker/gestolen%20kunst%20Jits%20Bakker.htm', u'http://www.gestolenkunst.nl/images/2002/nijssen/ouderamstel_nijssen.htm', u'http://www.gestolenkunst.nl/images/onna/onna.htm', u'http://www.gestolenkunst.nl/images/haring/haring.htm', u'http://www.gestolenkunst.nl/images/dijk/dijk.htm', u'http://www.gestolenkunst.nl/images/janneman/janneman.htm', u'http://www.gestolenkunst.nl/images/hessels/hessels.htm', u'http://www.gestolenkunst.nl/images/onna/onna.htm', u'http://www.gestolenkunst.nl/images/2012/culemborg/culemborg_1998.htm', u'http://www.gestolenkunst.nl/images/1998/mierlo/mierlo.htm', u'http://www.gestolenkunst.nl/images/oud/wigbold/wigbold.htm', u'http://www.gestolenkunst.nl/images/oud/nijmegen/nijmegen.htm', u'http://www.gestolenkunst.nl/images/oud/amsterdam_ommen/amstedam_ommen.htm']
You could be more specific with the select using the td "td a[href^=images/]" but it will return the same as all those links are in td tags.
The actual images are in the linked pages, so we need to visit each link, find the img tag, extract the src, then download and save the image:
import os
import urllib2  # was missing: urllib2.urlopen is called below (Python 2 stdlib)
from urlparse import urljoin

from bs4 import BeautifulSoup

url = "http://www.gestolenkunst.nl/gestolen%20overzicht.htm" # change to whatever your url is
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, "html.parser")
base = "http://www.gestolenkunst.nl/"
# Absolute URLs of the detail pages linked from the table cells. The
# attribute value is quoted because "images/" is not a bare CSS identifier.
images = [urljoin(base, a["href"]) for a in soup.select('td a[href^="images/"]')]

# A separate loop name keeps the overview ``url`` above intact.
for page_url in images:
    # Only the first <img> of each detail page is downloaded.
    img_tag = BeautifulSoup(urllib2.urlopen(page_url).read(), "lxml").find("img")
    if img_tag is None or not img_tag.get("src"):
        # Page without a usable image: nothing to download.
        continue
    src = img_tag["src"]
    # Save under the bare file name so a src containing directories does not
    # point at a local folder that does not exist.
    filename = os.path.basename(src)
    if not filename:
        continue
    # "wb": image data is binary; text mode can corrupt it on some platforms.
    with open(filename, "wb") as f:
        # urljoin resolves src relative to the page it was found on and also
        # copes with absolute or "../" style src values.
        f.write(urllib2.urlopen(urljoin(page_url, src)).read())
So for http://www.gestolenkunst.nl/images/2015/eindhoven/eindhoven.htm we get:
Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am so stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup
# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
# href=True skips anchors that carry no href instead of raising KeyError.
all_links = soup.find_all("a", href=True)
# Collect the target of every anchor on the index page.
my_list = [link["href"] for link in all_links]
# Entries 1..92 of the anchor list are the links of interest (per the author).
for i in my_list[1:93]:
    print(i)
# The output shows all the links that I would like to follow and gather information from. How do I do that?
Here is my solution using lxml. It is similar to BeautifulSoup.
import lxml
from lxml import html
import requests
page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[#class="LetterGroup"]//a/#href') # grab all link
print 'Length of all links = ', len(my_list)
my_list is a list consisting of all the links. Now you can use a for loop to scrape information from each page.
We can loop through each link. Inside each page, you can extract information as in the example below. This is only for the top table.
# One entry per page: (url, key, value) triples taken from the status table.
table_information = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    # XPath addresses attributes with "@"; the "#" form is not valid XPath.
    table_key = tree.xpath('//td[@class="statusHeader"]/text()')
    # Values are the direct text of the statusText cells followed by the text
    # of any links inside those cells.
    # NOTE(review): this ordering may not line up with table_key when a value
    # is a link -- confirm against the page.
    table_value = tree.xpath('//td[@class="statusText"]/text()') + tree.xpath('//td[@class="statusText"]/a/text()')
    # zip stops at the shortest of the three sequences.
    table_information.append(zip([t]*len(table_key), table_key, table_value))
For the table further down the page:
# One entry per page: the url followed by the StemmerNu cell texts of the
# primary rows and then of the secondary rows.
table_information_below = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    # XPath addresses attributes with "@"; the "#" form is not valid XPath.
    l1 = tree.xpath('//tr[@class="tableRowPrimary"]/td[@class="StemmerNu"]/text()')
    l2 = tree.xpath('//tr[@class="tableRowSecondary"]/td[@class="StemmerNu"]/text()')
    table_information_below.append([t]+l1+l2)
Hope this helps!
A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
    # Download one page, then hand its markup to BeautifulSoup.
    response = urllib2.urlopen(url)
    soup = BeautifulSoup(response.read())
    # then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession
def my_parse_function(html):
    """Parse the markup of one page and return all of its ``<p>`` elements."""
    return BeautifulSoup(html).find_all('p')
# Issue all requests up front; up to five run concurrently.
session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
# Iterate ``futures`` (the original referenced an undefined ``results``).
# future.result() blocks until that response has arrived; its ``content`` is
# the markup my_parse_function expects, not the Response object itself.
page_results = [my_parse_function(future.result().content) for future in futures]
This would be my solution for your problem
import requests
from bs4 import BeautifulSoup
def spider():
    """Crawl the index page and follow every link in each letter group.

    Requests ``http://www.kmdvalg.dk/main`` and, for every anchor with an
    ``href`` inside each ``div`` of class ``LetterGroup``, prints the anchor
    text and the ``href`` and then calls ``spider2(href)`` to scrape the
    linked page.
    """
    url = "http://www.kmdvalg.dk/main"
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for group in soup.findAll('div', {'class': 'LetterGroup'}):
        # Visit every anchor of the group: find('a') returned only the first
        # one and returned None (crashing on .get) for a group with no links.
        for anc in group.findAll('a', href=True):
            href = anc.get('href')
            print(anc.getText())
            print(href)
            # Follow the link: spider2 does for the linked page what this
            # function does for the index page.
            spider2(href)
            print("\n")
def spider2(linktofollow):
    """Fetch ``linktofollow`` and print the first cell of each primary row.

    Every ``tr`` of class ``tableRowPrimary`` on the page contributes the
    text of its first ``td``, followed by a blank separator.
    """
    response = requests.get(linktofollow)
    soup = BeautifulSoup(response.text, 'html.parser')
    for row in soup.findAll('tr', {'class': 'tableRowPrimary'}):
        first_cell = row.find('td')
        print(first_cell.getText())
        print("\n")
# Start the crawl from the index page.
spider()
It's not done... I only get a single element from the table, but you get the idea of how it's supposed to work.
Here is my final code, which works smoothly. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs
# Collect the target of every anchor on the index page.
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
alle_links = soup.find_all("a")
liste = [link["href"] for link in alle_links]

# The with-block closes the output file even when a page fails to download
# or lacks one of the expected cells.
with codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1") as f:
    # Entries 1..92 of the anchor list are the pages to scrape.
    for url in liste[1:93]:
        page_soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
        tds = page_soup.findAll('td')
        stemmernu = page_soup.findAll('td', class_='StemmerNu')
        # One ";"-separated record per page. The fixed indices depend on the
        # page layout and raise IndexError if a page has fewer cells.
        print >> f, tds[5].string,";",tds[12].string,";",tds[14].string,";",tds[16].string,";", stemmernu[0].string,";",stemmernu[1].string,";",stemmernu[2].string,";",stemmernu[3].string,";",stemmernu[6].string,";",stemmernu[8].string,";",'\r\n'