Can't scrape .aspx sites - python

I am trying to scrape numerous companies sites in Python for their news releases.
I figured out I need to use chickennoodle = soup(html_text, 'lxml') instead of chickennoodle = soup(html_text, 'html.parser') for aspx sites. I am still getting the basic urls back like their contact and careers links instead of the actual news article links. When I inspect the website it looks something like:
<a class="module_headline-link" href="/news-and-events/news/news-details/2022/Compugen-to-Release-Second-Quarter-Results-on-Thursday-August-4-2022/default.aspx">Compugen to Release Second Quarter Results on Thursday, August 4, 2022</a>.
On the basic html sites it works to print all of my_links and I can filter which link by the hashed out lines. I thought I'd add a few examples of troubled scrapes and one of a working one. I assume the not working ones are the same problem and probably due to not understanding the intricacies of lxml. I just assume it can't see the articles for some reason (unlike the html) because they start with /. Thanks for any help.
COMPANY 1-
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
URL = 'https://ir.cgen.com/news-and-events/news/default.aspx'
full = ''
html_text = requests.get(URL).text
chickennoodle = soup(html_text, 'lxml')
for link in chickennoodle.find_all('a'):
my_links = (link.get('href'))
print(my_links)
#if str(my_links).startswith("/news-and-events/news/news-details/"):
# print(str(full)+my_links)
#else:
# None
COMPANY 2-
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
URL = 'https://www.meipharma.com/media/press-releases'
full = ''
html_text = requests.get(URL).text
chickennoodle = soup(html_text, 'html.parser')
for link in chickennoodle.find_all('a'):
my_links = (link.get('href'))
print(my_links)
# if str(my_links).startswith(""):
# print(str(full)+my_links)
# else:
# None
COMPANY 3-
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
URL = 'https://investor.sierraoncology.com/news-releases/default.aspx'
full = ''
html_text = requests.get(URL).text
chickennoodle = soup(html_text, 'lxml')
for link in chickennoodle.find_all('a'):
my_links = (link.get('href'))
print(my_links)
VS html site that works for my purposes
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
URL = "https://investors.aileronrx.com/index.php/news-releases"
full = "https://investors.aileronrx.com"
ALRNlinks = []
html_text = requests.get(URL).text
chickennoodle = soup(html_text, 'html.parser')
for link in chickennoodle.find_all('a'):
my_links = (link.get('href'))
if str(my_links).startswith("/news-rele"):
ALRN = (str(full)+my_links)
ALRNlinks.append(ALRN)
print(ALRNlinks)

The website from your first example is loading information dynamically in page, so requests won't see the information pulled by javascript, after the page loaded. You can however look into Dev Tools - network tab, and see which urls are being accessed by javascript, and try and scrape those. For example:
import requests
import pandas as pd
url = 'https://ir.cgen.com/feed/PressRelease.svc/GetPressReleaseList?LanguageId=1&bodyType=0&pressReleaseDateFilter=3&categoryId=1cb807d2-208f-4bc3-9133-6a9ad45ac3b0&pageSize=-1&pageNumber=0&tagList=&includeTags=true&year=2022&excludeSelection=1'
r = requests.get(url)
df = pd.DataFrame(r.json()['GetPressReleaseListResult'])
print(df)
This will print out:
Attachments Body Category DocumentFileSize DocumentFileType DocumentPath ExcludeFromLatest Headline LanguageId LinkToDetailPage ... RevisionNumber SeoName ShortBody ShortDescription Subheadline SubheadlineHtml TagsList ThumbnailPath WorkflowId PressReleaseDate
0 [] None PDF https://s26.q4cdn.com/977440944/files/doc_news... False Compugen to Release Second Quarter Results on ... 1 /news-and-events/news/news-details/2022/Compug... ... 33221 Compugen-to-Release-Second-Quarter-Results-on-... None None None [] https://s26.q4cdn.com/977440944/files/doc_news... e7b13fbb-ddc7-4955-a9c6-b44e6ab223ec 07/21/2022 07:00:00
1 [] None PDF https://s26.q4cdn.com/977440944/files/doc_news... False Compugen to Present at Upcoming Industry Confe... 1 /news-and-events/news/news-details/2022/Compug... ... 33213 Compugen-to-Present-at-Upcoming-Industry-Confe... None None None [] https://s26.q4cdn.com/977440944/files/doc_news... 1e5cb121-a9f7-4e1b-86c1-1571065d40b5 06/27/2022 07:00:00
2 [] None PDF https://s26.q4cdn.com/977440944/files/doc_news... False Compugen to Present at Upcoming Investor Confe... 1 /news-and-events/news/news-details/2022/Compug... ... 33202 Compugen-to-Present-at-Upcoming-Investor-Confe... None None None [] https://s26.q4cdn.com/977440944/files/doc_news... 8c004950-09c8-4831-bdfa-25f660afe250 06/01/2022 07:00:00
[...]
You can apply this for your other examples as well.

Related

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- code that (somewhat) successfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
links = []
website = requests.get(url)
website_text = website.text
soup = BeautifulSoup(website_text)
for link in soup.find_all('a'):
links.append(link.get('href'))
for link in links:
print(link)
print(len(links))
get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
profiles = []
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
for profile in container.find_all('a'):
profiles.append(profile.get('p'))
for profile in profiles:
print(profile)
get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!
You need to combine your two scripts together and make requests for each player. Try the following approach. This searches for <td> tags that have the data-th="Player" attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
    """Collect the profile-box details of every player listed on a roster page.

    Fetches ``url``, finds each ``<td data-th="Player">`` cell, follows the
    player link inside it, and turns the ``<p>`` lines of that player's
    ``profile-box`` div into dictionary entries.

    Args:
        url: Address of a page listing players (one ``<td data-th="Player">``
            cell per player, each containing a relative link).

    Returns:
        A list of dicts, one per player. Every dict has ``"Name"`` and
        ``"URL"``; the remaining keys come from the ``"Key: value"`` lines
        found in the player's profile box.
    """
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        # A cell without a usable link cannot be followed; skip it instead of
        # crashing on ``None``.
        if a_tag is None or not a_tag.get('href'):
            continue

        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name": name, "URL": player_url}

        # Some player pages may lack the profile box; keep the name/URL row
        # rather than raising AttributeError on ``None``.
        if div_profile_box is not None:
            for p in div_profile_box.find_all("p"):
                key, sep, value = p.get_text(strip=True).partition(':')
                if not sep:
                    # not all entries have values ("Key: value" form)
                    continue
                row[key.strip()] = value.strip()

        data.append(row)

    return data
urls = [
'https://basketball.realgm.com/dleague/players/2022',
'https://basketball.realgm.com/dleague/players/2021',
'https://basketball.realgm.com/dleague/players/2020',
]
for url in urls:
print(f"Getting: {url}")
data = get_links(url)
for entry in data:
print(entry)

Pulling all yelp reviews via beautifulsoup

I need some help in pulling all reviews for a hotel using beautiful soup; this is what i have thus far, but i need some inspiration pulling all the reviews via API or regular.
import time
import random
from bs4 import BeautifulSoup as bs
import urllib.request as url
html = urllib.request.urlopen('https://www.yelp.com/biz/shore-cliff-hotel-pismo-beach-2').read().decode('utf-8')
soup = bs(html, 'html.parser')
relevant= soup.find_all('p', class_='comment__09f24__gu0rG css-qgunke')
reviews = []
for div in relevant:
for html_class in div.find_all('span',class_="raw__09f24__T4Ezm"):
text = html_class.find('span')
review = html_class.getText(
reviews.append(review)
enter code here
This does the job,
import requests
from bs4 import BeautifulSoup

base_url = "https://www.yelp.com/biz/capri-laguna-laguna-beach"
new_page = "?start={}"
reviews = []

# Each results page is addressed by "?start=<offset>", with 10 reviews per
# page; offsets 0, 10, ..., 500 cover 51 pages.
for i in range(0, 501, 10):
    # Build the URL for THIS page and fetch/parse that page's own content
    # (not the base URL again), otherwise every iteration sees the same reviews.
    new_page_url = base_url + new_page.format(i)
    new_content = requests.get(new_page_url).content
    new_soup = BeautifulSoup(new_content, "html.parser")
    relevant = new_soup.find_all('p', class_='comment__09f24__gu0rG css-qgunke')
    for div in relevant:
        for html_class in div.find_all('span', class_="raw__09f24__T4Ezm"):
            review = html_class.getText()
            reviews.append(review)
Code explanation -
If you click to go to the 2nd page you'll see that ?start=10 gets added to the base URL https://www.yelp.com/biz/capri-laguna-laguna-beach. If you go to the 3rd page then you'll see ?start=20 and so on. The number here is the zero-based index of the first review on the page, and each page has 10 of them. There are 51 total pages, meaning the first review on the 51st page is the 501st review overall, i.e. index 500. So the added part to the URL would be ?start=500.
So for each page on the website, the code creates a new URL, gets the HTML content of that URL, creates a soup for it and fetches the review from this newly created soup.

How can I get the correct urls of ads?

I'm trying to scrape the urls of the ads on "Marktplaats" website (link is provided below).
As you can see I'm looking for 30 URLs. These URLs are placed inside a 'href' field and all start with "/a/auto-s/". Unfortunately, I only keep getting the first few URLs. I found out that on this sites all the data is places within "<li class = "mp-Listing mp-Listing--list-item"> ... </li>". Does anyone have an idea how to fix it? (you can see that you won't find all the URLs of the ads when you run my code)
Link:
https://www.marktplaats.nl/l/auto-s/#f:10882,10898|PriceCentsTo:350000|constructionYearFrom:2001|offeredSince:TODAY|searchInTitleAndDescription:true
My code:
import requests
from bs4 import BeautifulSoup
url = "https://www.marktplaats.nl/l/auto-s/#f:10882,10898|PriceCentsTo:350000|constructionYearFrom:2001|offeredSince:TODAY|searchInTitleAndDescription:true"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
url_list = soup.find_all(class_ = 'mp-Listing mp-Listing--list-item')
print(url_list)
You can try something like this:
import requests
from bs4 import BeautifulSoup
def parse_links(url):
links = []
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
for li in soup.find_all(class_="mp-Listing mp-Listing--list-item"):
links.append(li.a.get('href'))
return links
url = "https://www.marktplaats.nl/l/auto-s/#f:10882,10898|PriceCentsTo:350000|constructionYearFrom:2001|offeredSince:TODAY|searchInTitleAndDescription:true"
links = parse_links(url)
print('\n'.join(map(str, links)))
Output
/a/auto-s/oldtimers/a1302359148-allis-chalmers-ed40-1965.html
/a/auto-s/bestelauto-s/a1258166221-opel-movano-2-3-cdti-96kw-2018.html
/a/auto-s/oldtimers/a1302359184-chevrolet-biscayne-bel-air-1960.html
/a/auto-s/renault/a1240974413-ruim-aanbod-rolstoelauto-s-www-autoland-nl.html
/a/auto-s/volkswagen/m1457703674-golf-6-1-2tsi-comfortline-bluemotion-77kw-2de-eigenaar.html
/a/auto-s/peugeot/m1457564187-peugeot-208-1-6-e-hdi-68kw-92pk-5-d-2014-zwart.html
/a/auto-s/volkswagen/m1457124365-volkswagen-touareg-3-2-v6-177kw-4motion-aut-2004-grijs.html
/a/auto-s/volkswagen/m1456753596-volkswagen-golf-vii-2-0-tdi-highline-150pk-xenon-trekhaak.html
/a/auto-s/bestelauto-s/a1001658686-200-nw-en-gebruikte-bestelwagens-personenbusjes-pick-ups.html
/a/auto-s/bestelauto-s/m940111355-bus-verkopen-bestelauto-inkoop-bestelwagen-opkoper-rdw.html
/a/auto-s/volkswagen/m1456401063-volkswagen-golf-1-6-74kw-2000-zwart.html
/a/auto-s/renault/m1456242548-renault-espace-2-0-dci-110kw-e4-2006-zwart.html
/a/auto-s/nissan/m1448699345-nissan-qashqai-1-5-dci-connect-2011-grijs-panoramadak.html
/a/auto-s/bestelauto-s/a1212708374-70-x-kleine-bestelwagens-lage-km-scherpe-prijzen.html
/a/auto-s/bmw/m1452641019-bmw-5-serie-2-0-520d-touring-aut-2014-grijs.html
/a/auto-s/mercedes-benz/m1448671698-mercedes-benz-a-klasse-a250-amg-224pk-7g-dct-panoramadak-wid.html
/a/auto-s/bmw/m1455671862-bmw-3-serie-2-0-i-320-cabrio-aut-2007-bruin.html
/a/auto-s/bestelauto-s/m1455562699-volkswagen-transporter-kmstand-151-534-2-5-tdi-65kw-2002.html
/a/auto-s/bestelauto-s/a1295698562-35-x-renault-kangoo-2013-t-m-2015-v-a-25000-km.html
/a/auto-s/infiniti/m1458111256-infiniti-q50-3-5-hybrid-awd-2016-grijs.html
/a/auto-s/ford/m1458111166-ford-ka-1-3-i-44kw-2007-zwart.html
/a/auto-s/bestelauto-s/m1457499260-renault-master-l3h2-2018-airco-camera-cruise-laadruimte-12.html
/a/auto-s/land-rover/m1458110209-land-rover-discovery-4-3-0-tdv6-2010-grijs.html
/a/auto-s/dodge/a1279463634-5-jaar-ram-dealer-garantie-lage-bijtelling.html
/a/auto-s/bmw/m1455389317-bmw-320i-e46-sedan-bieden.html
/a/auto-s/ford/m1457306473-ford-galaxy-2-0-tdci-85kw-dpf-2011-blauw.html
/a/auto-s/peugeot/m1456912876-peugeot-407-2-0-16v-sw-2006-grijs.html
/a/auto-s/bestelauto-s/m1457161395-renault-master-t35-2-3-dci-l3h2-130-pk-navi-airco-camera-pdc.html
/a/auto-s/bestelauto-s/a1299134880-citroen-berlingo-1-6-hdi-2017-airco-sd-3-zits-v-a-179-p-m.html
/a/auto-s/hyundai/m1458105451-hyundai-atos-gezocht-hoge-prijs-tel-0653222206.html
/a/auto-s/volkswagen/m1458103618-volkswagen-polo-1-4-tsi-132kw-dsg-2012-wit.html
/a/auto-s/vrachtwagens/m1458101965-scania-torpedo.html
/a/auto-s/toyota/m1458101624-toyota-yaris-1-0-12v-vvt-i-aspiration-5dr-2012.html
/a/auto-s/dodge/a1279447576-5-jaar-ram-dealer-garantie-en-historie-bekijk-onze-website.html
You can also build the actual url of the page by prepending 'https://www.marktplaats.nl' to li.a.get('href'). So, your whole code should look like this:
import requests
from bs4 import BeautifulSoup
def parse_links(url):
links = []
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
for li in soup.find_all(class_="mp-Listing mp-Listing--list-item"):
links.append('https://www.marktplaats.nl' + li.a.get('href'))
return links
url = "https://www.marktplaats.nl/l/auto-s/#f:10882,10898|PriceCentsTo:350000|constructionYearFrom:2001|offeredSince:TODAY|searchInTitleAndDescription:true"
links = parse_links(url)
print('\n'.join(map(str, links)))
It should produce the output like this:
https://www.marktplaats.nl/a/auto-s/renault/a1302508082-mooi-renault-megane-scenic-1-6-16v-aut-2005-2003-groen-airco.html
https://www.marktplaats.nl/a/auto-s/oldtimers/a1302359157-morris-minor-cabriolet-1970.html
https://www.marktplaats.nl/a/auto-s/oldtimers/a1302743902-online-veiling-oldtimers-en-classic-cars-zedelgem-vavato.html
https://www.marktplaats.nl/a/auto-s/oldtimers/a1302359138-mercedes-benz-g-500-guard-pantzer-1999.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1457703674-golf-6-1-2tsi-comfortline-bluemotion-77kw-2de-eigenaar.html
https://www.marktplaats.nl/a/auto-s/peugeot/m1457564187-peugeot-208-1-6-e-hdi-68kw-92pk-5-d-2014-zwart.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1457124365-volkswagen-touareg-3-2-v6-177kw-4motion-aut-2004-grijs.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1456753596-volkswagen-golf-vii-2-0-tdi-highline-150pk-xenon-trekhaak.html
https://www.marktplaats.nl/a/auto-s/volkswagen/a1279696849-vw-take-up-5-d-radio-airco-private-lease.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/m940111355-bus-verkopen-bestelauto-inkoop-bestelwagen-opkoper-rdw.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1456401063-volkswagen-golf-1-6-74kw-2000-zwart.html
https://www.marktplaats.nl/a/auto-s/renault/m1456242548-renault-espace-2-0-dci-110kw-e4-2006-zwart.html
https://www.marktplaats.nl/a/auto-s/nissan/m1448699345-nissan-qashqai-1-5-dci-connect-2011-grijs-panoramadak.html
https://www.marktplaats.nl/a/auto-s/citroen/a1277007710-citroen-c1-feel-5-d-airco-private-lease-vanaf-189-euro-mnd.html
https://www.marktplaats.nl/a/auto-s/bmw/m1452641019-bmw-5-serie-2-0-520d-touring-aut-2014-grijs.html
https://www.marktplaats.nl/a/auto-s/mercedes-benz/m1448671698-mercedes-benz-a-klasse-a250-amg-224pk-7g-dct-panoramadak-wid.html
https://www.marktplaats.nl/a/auto-s/bmw/m1455671862-bmw-3-serie-2-0-i-320-cabrio-aut-2007-bruin.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/m1455562699-volkswagen-transporter-kmstand-151-534-2-5-tdi-65kw-2002.html
https://www.marktplaats.nl/a/auto-s/peugeot/a1298813052-private-lease-occasion-outlet-prive-lease.html
https://www.marktplaats.nl/a/auto-s/audi/m1458114563-audi-a4-2-0-tfsi-132kw-avant-multitronic-nl-auto.html
https://www.marktplaats.nl/a/auto-s/mercedes-benz/m1452983872-mercedes-a-klasse-2-0-cdi-a200-5drs-aut-2007-grijs.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/m1457499260-renault-master-l3h2-2018-airco-camera-cruise-laadruimte-12.html
https://www.marktplaats.nl/a/auto-s/infiniti/m1458111256-infiniti-q50-3-5-hybrid-awd-2016-grijs.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/a1001658686-200-nw-en-gebruikte-bestelwagens-personenbusjes-pick-ups.html
https://www.marktplaats.nl/a/auto-s/ford/m1458111166-ford-ka-1-3-i-44kw-2007-zwart.html
https://www.marktplaats.nl/a/auto-s/land-rover/m1458110209-land-rover-discovery-4-3-0-tdv6-2010-grijs.html
https://www.marktplaats.nl/a/auto-s/bmw/m1455389317-bmw-320i-e46-sedan-bieden.html
https://www.marktplaats.nl/a/auto-s/bestelauto-s/m1457161395-renault-master-t35-2-3-dci-l3h2-130-pk-navi-airco-camera-pdc.html
https://www.marktplaats.nl/a/auto-s/renault/a1302508082-mooi-renault-megane-scenic-1-6-16v-aut-2005-2003-groen-airco.html
https://www.marktplaats.nl/a/auto-s/ford/m1457306473-ford-galaxy-2-0-tdci-85kw-dpf-2011-blauw.html
https://www.marktplaats.nl/a/auto-s/peugeot/m1456912876-peugeot-407-2-0-16v-sw-2006-grijs.html
https://www.marktplaats.nl/a/auto-s/hyundai/m1458105451-hyundai-atos-gezocht-hoge-prijs-tel-0653222206.html
https://www.marktplaats.nl/a/auto-s/volkswagen/m1458103618-volkswagen-polo-1-4-tsi-132kw-dsg-2012-wit.html
https://www.marktplaats.nl/a/auto-s/oldtimers/a1302743902-online-veiling-oldtimers-en-classic-cars-zedelgem-vavato.html
Good luck!

Get links from a site's homepage using python

I want to write a script to get a home page's links to social media (twitter / facebook mostly), and I'm completely stuck since I am fairly new to Python.
The task I want to accomplish is to parse the website, find the social media links, and save it in a new data frame where each column would contain the original URL, the twitter link, and the facebook link. Here's what I have so far of this code for the new york times website:
from bs4 import BeautifulSoup
import requests
url = "http://www.nytimes.com"
r = requests.get(url)
sm_sites = ['twitter.com','facebook.com']
soup = BeautifulSoup(r.content, 'html5lib')
all_links = soup.find_all('a', href = True)
for site in sm_sites:
if all(site in sm_sites for link in all_links):
print(site)
else:
print('no link')
I'm having some problems understanding what the loop is doing, or how to make it work for what I need it to. I also had tried to store the site instead of doing print(site) but that was not working... So I figured I'd ask for help. Before asking, I went through a bunch of responses here but none could get me to do what I needed to do.
the way this code works, you already have your links. Your homepage link is the starting url, so http://www.nytimes.com.
And you have the social media urls sm_sites = ['twitter.com','facebook.com'], all you're doing is confirming they exist on the main page. If you want to save the list of confirmed social media urls, then append them to a list
Here is one way to get the social media links off a page
import requests
from bs4 import BeautifulSoup
url = "https://stackoverflow.com/questions/tagged/python"
r = requests.get(url)
sm_sites = ['twitter.com','facebook.com']
sm_sites_present = []
soup = BeautifulSoup(r.content, 'html5lib')
all_links = soup.find_all('a', href = True)
for sm_site in sm_sites:
for link in all_links:
if sm_site in link.attrs['href']:
sm_sites_present.append(link.attrs['href'])
print(sm_sites_present)
output:
['https://twitter.com/stackoverflow', 'https://www.facebook.com/officialstackoverflow/']
Update
for a df of urls
import requests
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display
urls = [
    "https://stackoverflow.com/questions/tagged/python",
    "https://www.nytimes.com/",
    "https://en.wikipedia.org/"
]
sm_sites = ['twitter.com','facebook.com']
sm_sites_present = []
columns = ['url'] + sm_sites
# One row per page; the social-media columns start empty (NaN) and are
# filled in from the scraped links below.
df = pd.DataFrame(data={'url' : urls}, columns=columns)


def get_sm(row):
    """Return the social-media links found on the page at ``row['url']``.

    Args:
        row: A DataFrame row with a ``'url'`` entry.

    Returns:
        A Series indexed by the entries of ``sm_sites`` that were found on
        the page; when several links match one site, the last one is kept.
    """
    r = requests.get(row['url'])
    # Explicit dtype: an empty Series with no dtype warns on pandas 1.x.
    output = pd.Series(dtype=object)
    soup = BeautifulSoup(r.content, 'html5lib')
    all_links = soup.find_all('a', href = True)
    for sm_site in sm_sites:
        for link in all_links:
            if sm_site in link.attrs['href']:
                output[sm_site] = link.attrs['href']
    return output


sm_columns = df.apply(get_sm, axis=1)
df.update(sm_columns)
# fillna returns a new frame; keep it and show it explicitly so the result
# is not lost when this runs outside a notebook cell.
df = df.fillna(value='no link')
display(df)
output
This will do what you want with regards to adding it to a DataFrame. You can iterate through a list of websites (urlsToSearch), adding a row to the dataframe for each one containing the base website, all facebook links, and all twitter links.
from bs4 import BeautifulSoup
import requests
import pandas as pd
df = pd.DataFrame(columns=["Website", "Facebook", "Twitter"])
urlsToSearch = ["http://www.nytimes.com","http://www.businessinsider.com/"]
for url in urlsToSearch:
r = requests.get(url)
tw_links = []
fb_links = []
soup = BeautifulSoup(r.text, 'html.parser')
all_links = [link['href'] for link in soup.find_all('a', href = True)] #only get href
for link in all_links:
if "twitter.com" in link:
tw_links.append(link)
elif "facebook.com" in link:
fb_links.append(link)
df.loc[df.shape[0]] = [url,fb_links,tw_links] #Add row to end of df

Web crawler - following links

Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am so stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup
# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
my_list = []
all_links = soup.find_all("a")
for link in all_links:
link2 = link["href"]
my_list.append(link2)
for i in my_list[1:93]:
print i
# The output shows all the links that I would like to follow and gather information from. How do I do that?
Here is my solution using lxml. It's similar to BeautifulSoup
import lxml
from lxml import html
import requests
page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
# XPath addresses attributes with '@' (e.g. @class, @href); '#' is not valid
# XPath syntax and makes tree.xpath() raise an evaluation error.
my_list = tree.xpath('//div[@class="LetterGroup"]//a/@href') # grab all link
# Single-argument print(...) works on both Python 2 and Python 3.
print('Length of all links = ' + str(len(my_list)))
my_list is a list consisting of all the links. Now you can use a for loop to scrape information from each page.
We can loop through each link. Inside each page, you can extract information as in the example below. This is only for the top table.
# For every result page, pair each header cell of the top status table with
# its value, tagged with the page URL it came from.
table_information = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    # XPath attribute tests use '@class'; '#class' is not valid XPath.
    table_key = tree.xpath('//td[@class="statusHeader"]/text()')
    table_value = tree.xpath('//td[@class="statusText"]/text()') + tree.xpath('//td[@class="statusText"]/a/text()')
    # list(...) so the rows are materialized on Python 3, where zip is lazy.
    table_information.append(list(zip([t] * len(table_key), table_key, table_value)))
For table below the page,
# For every result page, collect the "StemmerNu" cells of the lower table:
# first those in primary rows, then those in secondary rows, prefixed with
# the page URL.
table_information_below = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    # XPath attribute tests use '@class'; '#class' is not valid XPath.
    l1 = tree.xpath('//tr[@class="tableRowPrimary"]/td[@class="StemmerNu"]/text()')
    l2 = tree.xpath('//tr[@class="tableRowSecondary"]/td[@class="StemmerNu"]/text()')
    table_information_below.append([t] + l1 + l2)
Hope this helps!
A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
soup = BeautifulSoup(urllib2.urlopen(url).read())
# then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession
def my_parse_function(html):
    """Use this function to parse each page.

    Args:
        html: The page's HTML markup (text or bytes).

    Returns:
        All ``<p>`` elements found in the page.
    """
    # Name the parser explicitly so the result does not depend on which
    # parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    all_paragraphs = soup.find_all('p')
    return all_paragraphs


session = FuturesSession(max_workers=5)
# session.get() returns immediately with a Future; the requests run in the
# background on the session's worker threads.
futures = [session.get(url) for url in my_list]
# future.result() blocks until that request finishes and yields the Response;
# hand its body (not the Response object) to the parser.
page_results = [my_parse_function(future.result().content) for future in futures]
This would be my solution for your problem
import requests
from bs4 import BeautifulSoup
def spider():
url = "http://www.kmdvalg.dk/main"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('div', {'class': 'LetterGroup'}):
anc = link.find('a')
href = anc.get('href')
print(anc.getText())
print(href)
# spider2(href) call a second function from here that is similar to this one(making url = to herf)
spider2(href)
print("\n")
def spider2(linktofollow):
url = linktofollow
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
anc = link.find('td')
print(anc.getText())
print("\n")
spider()
It's not done... I only get a simple element from the table, but you get the idea of how it's supposed to work.
Here is my final code that works smooth. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs
f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
liste = []
alle_links = soup.find_all("a")
for link in alle_links:
link2 = link["href"]
liste.append(link2)
for url in liste[1:93]:
soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
tds = soup.findAll('td')
stemmernu = soup.findAll('td', class_='StemmerNu')
print >> f, tds[5].string,";",tds[12].string,";",tds[14].string,";",tds[16].string,";", stemmernu[0].string,";",stemmernu[1].string,";",stemmernu[2].string,";",stemmernu[3].string,";",stemmernu[6].string,";",stemmernu[8].string,";",'\r\n'
f.close()

Categories