A newbie scraper here !
I am currently stuck with a tedious and boring task where I have to copy and paste certain content from AngelList and save it in Excel. I have previously used scrapers to automate such tasks, but this one is quite tough and I have been unable to find a way to automate it. Please find the website link below:
https://angel.co/people/all
Kindly apply filters Location-> USA, and Market-> Online Dating. There will be around 550 results (please note that the URL doesn't change when you apply the filters)
I have successfully scraped the URLs of all the profiles once filters are applied. Therefore, I have an excel file with 550 URLs of these profiles.
Now the next step is to go to individual profiles and scrape certain information. I am looking for these fields currently:
Name
Description Information
Investments
Founder
Advisor
Locations
Markets
What I'm looking for
Now I have tried a lot of solutions but none have worked so far. Import.io, data miner, data scraper tools are not helping me much.
Can you suggest any VBA code, Python code, or tool that could help me automate this scraping task?
COMPLETE CODE FOR SOLUTION:
Here is the final code with comments. If someone still has problems, please comment below and I will try to help you out.
from bs4 import BeautifulSoup
import urllib2
import json
import csv
def fetch_page(url):
    """Download ``url`` and return the raw body of the HTTP response.

    The request goes through a dedicated opener so that a custom
    ``User-Agent`` header can be attached to it.

    :param url: address of the page to download.
    :return: the response body exactly as returned by ``read()``.

    Errors raised while opening or reading the URL propagate to the caller.
    """
    opener = urllib2.build_opener()
    # changing the user agent as the default one is banned
    opener.addheaders = [('User-Agent', 'Mozilla/43.0.1')]
    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Close the response explicitly: this function is called once per
        # profile, and unclosed responses keep their sockets open.
        response.close()
# Column titles of the output file, in the order the cells are written.
HEADER = ["URL", "Name", "Founder", "Advisor", "Employee", "Board Member",
          "Customer", "Locations", "Markets", "Investments",
          "What_iam_looking_for"]


def texts(nodes):
    """Return the stripped text of every node in ``nodes`` as a list."""
    return [node.get_text().strip() for node in nodes]


def tag_texts(bs, field):
    """Return the text of each <span> in the <div> whose data-field is ``field``.

    An empty list is returned when the page has no such <div>.
    """
    wrapper = bs.body.find('div', attrs={'data-field': field})
    if wrapper is None:
        return []
    return [span.text for span in wrapper.find_all('span')]


def to_cell(value):
    """Convert a scraped value into a byte string for the Python 2 csv writer.

    Lists are flattened into a single "; "-separated cell; unicode text is
    encoded as UTF-8 so that non-ASCII names do not raise on output.
    """
    if isinstance(value, (list, tuple)):
        value = u"; ".join(value)
    if not isinstance(value, str):
        value = value.encode("utf-8")
    return value


def scrape_profile(url):
    """Fetch one profile page and return its row as a list of cells.

    The cells follow the order of ``HEADER``. Raises ``IndexError`` or
    ``KeyError`` when an expected element is missing from the page.
    """
    bs = BeautifulSoup(fetch_page(url), "html.parser")
    name = bs.select(".profile-text h1")[0].get_text().strip()
    founder = texts(bs.select('.role_founder a'))
    advisor = texts(bs.select('.role_advisor a'))
    employee = texts(bs.select('.role_employee a'))
    board_member = texts(bs.select('.role_board_member a'))
    customer = texts(bs.select('.role_customer a'))
    locations = tag_texts(bs, 'tags_interested_locations')
    markets = tag_texts(bs, 'tags_interested_markets')
    what_iam_looking_for = ' '.join(texts(bs.select('div.criteria p')))
    user_id = bs.select('.profiles-show .profiles-show')[0]['data-user_id']
    # investments are loaded using separate request and response is in JSON format
    json_data = fetch_page("https://angel.co/startup_roles/investments?user_id=%s" % user_id)
    investments = [record['company']['company_name']
                   for record in json.loads(json_data)]
    return [url, name, founder, advisor, employee, board_member, customer,
            locations, markets, investments, what_iam_looking_for]


# URLs to iterate over are stored one per line in 'profiles_links.csv'.
# The csv module does the quoting, so commas inside a value no longer need
# to be replaced, and both files are closed when the block exits.
# Binary mode is what the Python 2 csv writer expects for its output file.
with open('angle_profiles.csv', 'wb') as f, open("profiles_links.csv") as f2:
    writer = csv.writer(f)
    writer.writerow(HEADER)
    index = 0
    for row in f2:
        url = row.strip()
        if not url:
            continue  # skip blank lines in the input file
        index += 1
        print("# Index: %d" % index)
        try:
            cells = scrape_profile(url)
        except urllib2.URLError as e:
            # Covers HTTP errors (404 and others) as well as connection
            # problems; report the real cause and move on to the next URL.
            print("Error fetching %s: %s" % (url, e))
            continue
        except (IndexError, KeyError, ValueError) as e:
            # The page or the investments JSON did not have the expected shape.
            print("Unexpected content for %s: %r" % (url, e))
            continue
        writer.writerow([to_cell(cell) for cell in cells])
Feel free to test the above code with any of the following links:
https://angel.co/idg-ventures?utm_source=people
https://angel.co/douglas-feirstein?utm_source=people
https://angel.co/andrew-heckler?utm_source=people
https://angel.co/mvklein?utm_source=people
https://angel.co/rajs1?utm_source=people
HAPPY CODING :)
For my recipe you will need to install BeautifulSoup using pip or easy_install
from bs4 import BeautifulSoup
import urllib2
import json
def fetch_page(url):
    """Download ``url`` and return the raw body of the HTTP response.

    A dedicated opener is used so that a custom ``User-Agent`` header can be
    sent with the request.

    :param url: address of the page to download.
    :return: the response body exactly as returned by ``read()``.

    Errors raised while opening or reading the URL propagate to the caller.
    """
    opener = urllib2.build_opener()
    # changing the user agent as the default one is banned
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()
html = fetch_page("https://angel.co/davidtisch")
# or load from local file
#html = open('page.html', 'r').read()
bs = BeautifulSoup(html, "html.parser")


def stripped_texts(selector):
    """Return the stripped text of every element matching ``selector``."""
    return [node.get_text().strip() for node in bs.select(selector)]


name = bs.select(".profile-text h1")[0].get_text().strip()
description = bs.select('div[data-field="bio"]')[0]['data-value']
founder = stripped_texts('.role_founder a')
advisor = stripped_texts('.role_advisor a')
locations = stripped_texts('div[data-field="tags_interested_locations"] a')
markets = stripped_texts('div[data-field="tags_interested_markets"] a')
what_iam_looking_for = ' '.join(stripped_texts('div.criteria p'))
user_id = bs.select('.profiles-show .profiles-show')[0]['data-user_id']
# investments are loaded using separate request and response is in JSON format
json_data = fetch_page("https://angel.co/startup_roles/investments?user_id=%s" % user_id)
investment_records = json.loads(json_data)
investments = [record['company']['company_name'] for record in investment_records]
Take a look at https://scrapy.org/
It lets you write a parser very quickly. Here's my example parser for a site similar to angel.co: https://gist.github.com/lisitsky/c4aac52edcb7abfd5975be067face1bb
Unfortunately, angel.co is not accessible to me right now. A good starting point:
$ pip install scrapy
$ cat > myspider.py <<EOF
import scrapy
class BlogSpider(scrapy.Spider):
    """Example spider: yields one item per matched heading, then follows the next-page link."""

    name = 'blogspider'
    start_urls = ['https://angel.co']

    def parse(self, response):
        """Yield a ``{'title': ...}`` item per heading and a request for the next page, if any."""
        # here's selector to extract interesting elements
        for heading in response.css('h2.entry-title'):
            # write down here values you'd like to extract from the element
            link_text = heading.css('a ::text').extract_first()
            yield {'title': link_text}
        # how to find next page
        next_href = response.css('div.prev-post > a ::attr(href)').extract_first()
        if not next_href:
            return
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
EOF
$ scrapy runspider myspider.py
Fill in the CSS selectors for the elements you are interested in and run the spider.
Related
I am currently new to web scraping in Python with BeautifulSoup and am attempting to gather news articles from the Sudan Tribune for a Human Rights project. The body text is contained in 'span' tags and my end goal is to filter out all articles that do not contain news about killings or HR violations. My question is, how can I refer to a specific body of text when each one is contained in a tag called 'span' with no class or id to differentiate it from the others.
My code so far gets me the links and body text of each article but I don't know how to call a specific one, only all of them at once. Ideally, Id like to be able to quickly refer to a specific article's body text and tell Python to either include it or not based on my own criteria.
URL = 'https://www.sudantribune.com/spip.php?rubrique1'
Source = requests.get(URL)
Soup = BeautifulSoup(Source.content, 'html.parser')
print("You are current crawling the website -> " + URL)
# href attribute of every <a> on the page, in document order.
links = [link.get('href') for link in Soup.find_all('a')]
print("The links to the articles from " + URL + " are:")
# Entries 45 through 54 of the list, each prefixed with the site root and
# printed one per line in a single print call.
print("\n".join("https://www.sudantribune.com/" + links[n] for n in range(45, 55)))
# Every <span> on the page, printed all at once.
Descriptions = Soup.find_all('span')
print(Descriptions)
I have only been working with Python for a week now so any suggestions are greatly appreciated
Do you want to retrieve the span from the different web pages living at different urls? If so, for each url you need to repeat the initial process of 'Getting' the data from that page and investigating.
URL = 'https://www.sudantribune.com/spip.php?rubrique1'
Source = requests.get(URL)
Soup = BeautifulSoup(Source.content, 'html.parser')
print("You are current crawling the website -> " + URL)
links = []
for link in Soup.find_all('a'):
    # `link` is a Tag, not a string: take its href before building the URL.
    href = link.get('href')
    if href is None:
        continue  # anchor without a target, nothing to fetch
    SubPage = requests.get("https://www.sudantribune.com/" + href)
    SubSoup = BeautifulSoup(SubPage.content, 'html.parser')
    # Read the spans of the sub-page that was just fetched, not of the index page.
    Descriptions = SubSoup.find_all('span')
    print(Descriptions)
    # SOME_CONDITION_YOU_SPECIFY is a placeholder for your own filter.
    if SOME_CONDITION_YOU_SPECIFY:
        links.append(href)  # Only append if it meets your criteria
If I was going to do it, I would do something like this
# is_this_interesting and store_interesting_article are placeholders for
# your own filter and storage functions.
for story in Soup.find_all("li"):
    span = story.find("span")  # might even be able to do "story.span"
    # find() returns None when the <li> has no <span>; skip those items
    # instead of failing on span.text.
    if span is None:
        continue
    if is_this_interesting(span.text):
        store_interesting_article(story)
I'm trying to webscrape https://old.reddit.com/r/all/ and get the entries on the first page.
When I run my code, it works but for the post_text it only copies the last post on the reddit page 25 times. I know this is because its getting the entry and then posting it each time through the loop.
import requests
import urllib.request
from bs4 import BeautifulSoup as soup
# Fetch the page with a custom User-Agent header and parse it.
my_url = 'https://old.reddit.com/r/all/'
request = urllib.request.Request(my_url,headers={'User-Agent': 'your bot 0.1'})
response = urllib.request.urlopen(request)
page_html = response.read()
page_soup = soup(page_html, "html.parser")
# Every <div class="top-matter">; post.p.a.text is read from each one below.
posts = page_soup.findAll("div", {"class": "top-matter"})
# NOTE(review): `post` and `author` are rebound by the loops below, so these
# first-element picks are not used by anything else in this snippet.
post = posts[0]
# Every <p class="tagline">; author.a.text is read from each one below.
authors = page_soup.findAll("p", {"class":"tagline"})
author = authors[0]
# Output file and its header line.
filename = "redditAll.csv"
f = open(filename, "w")
headers = "Title of the post, Author of the post\n"
f.write(headers)
# Commas in the title are replaced with " -" before the comma-joined line is written.
for post in posts:
post_text = post.p.a.text.replace(",", " -")
# NOTE(review): per the discussion that follows, the write belongs to this
# second loop only, so post_text still holds the last post for every row.
for author in authors:
username = author.a.text
f.write(post_text + "," + username + "\n")
f.close()
Changed this
# Quoted "before" version: posts and authors are walked in two separate loops.
for post in posts:
post_text = post.p.a.text.replace(",", " -")
for author in authors:
username = author.a.text
To that
# Walk posts and authors in lockstep so each title is paired with its author.
for post, author in zip(posts, authors):
    post_text = post.p.a.text.replace(",", " -")
    username = author.a.text
    # The write must happen inside the loop; without it the values computed
    # above are discarded on every pass.
    f.write(post_text + "," + username + "\n")
LTheriault is correct, but I'd consider this more idiomatic.
# Pair each post with the author at the same position and write one line per pair.
for post, author in zip(posts, authors):
    title = post.p.a.text.replace(",", " -")
    f.write(",".join((title, author.a.text)) + "\n")
You're doing the two loops separately. In your code below, you're looping through each post and assigning a string to post_text, but doing nothing else with it. When that loop is done, post_text is the last thing it has been assigned as before it moves into the authors loop and writes a string with each author and the string you have stored in post_text.
# Quoted from the question: the first loop only assigns post_text.
for post in posts:
post_text = post.p.a.text.replace(",", " -")
# The write happens in this second loop, after the first loop has finished.
for author in authors:
username = author.a.text
f.write(post_text + "," + username + "\n")
Assuming that there are an equal number of elements in posts and authors, you should be able to fix it with the following:
# Walk the posts by position and take the author stored at the same index.
for i, post in enumerate(posts):
    post_text = post.p.a.text.replace(",", " -")
    username = authors[i].a.text
    f.write(post_text + "," + username + "\n")
The problem here is that you're writing to the file object within the scope of the second for loop (for author in authors), so you will indeed write the last value of post_text multiple times.
If you want to combine authors and posts, you can zip them and then iterate over the pairs (assuming they are the same length):
# zip(posts, authors) yields (post, author) pairs, so the loop names must be
# in that order.
for post, author in zip(posts, authors):
    # One record per line; the trailing newline keeps records from running together.
    f.write(f'author: {author}, post: {post}\n')
I would also recommend to write to file using a context manager
eg.
# The file is closed automatically when the with-block ends.
with open('filename.txt', 'w') as out:
    out.write('stuff')
I have written a web scraper according to a youtube vid. It gives me just one container from all 48 containers.
Why isn't my code looping through all the containers automatically? What did I miss here?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
# Search-results page to scrape.
my_url = 'https://www.tradera.com/search?itemStatus=Ended&q=iphone+6+-6s+64gb+-plus'
#
# Download the page and close the connection once the body has been read.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#html parsing
page_soup = soup(page_html, "html.parser")
#Container
# Every <div class="item-card-details"> on the page.
containers = page_soup.findAll("div",{"class":"item-card-details"})
# Output file and its header line.
filename = "ip6.csv"
f = open(filename, "w")
headers = "title, link, price, bids\n"
f.write(headers)
# NOTE(review): the discussion below attributes the single-result problem to
# which of the following lines are indented under this `for`.
for container in containers:
title = container.div.div.h3["title"]
link = container.div.div.h3.a["href"]
price_container = container.findAll("span",{"class":"item-card-details-price-amount"})
price = price_container[0].text
bid_container = container.findAll("span",{"class":"item-card-details-bids"})
bids = bid_container[0].text
print("title: " + title)
print("link: " + link)
print("price: " + price)
print("bids: " + bids)
f.write(title + "," + link + "," + price + "," + bids + "\n")
# NOTE(review): f.close is only referenced here, not called (no parentheses).
f.close
Because the loop is "empty". In python you have to indent the block of code that should run inside the loop, e.g.:
for i in loop:
    # do something
    # (a comment alone is not a loop body, so `pass` keeps the example valid)
    pass
In your code:
# Quoted from the question as this answer describes it: the lines after the
# `for` header are not indented, so none of them belong to the loop.
for container in containers:
title = container.div.div.h3["title"]
link = container.div.div.h3.a["href"]
price_container = container.findAll("span",{"class":"item-card-details-price-amount"})
price = price_container[0].text
bid_container = container.findAll("span",{"class":"item-card-details-bids"})
bids = bid_container[0].text
print("title: " + title)
print("link: " + link)
print("price: " + price)
print("bids: " + bids)
f.write(title + "," + link + "," + price + "," + bids + "\n")
# NOTE(review): f.close is only referenced here, not called (no parentheses).
f.close
You asked me what was going on and why I get the correct result. Below the script adjusted for py 3.5. As it appears some error occurs at the print line. I by accident almost fixed your script in your question itself.
As Ilja pointed out there were indentation errors and its correct he mentioned empty list returns... prior to my accidental partial fix. What I missed out in the accidental fix was not bringing in the print statements into the for-loop. So I get one result. Checking the web-page... you want to collect all phone products.
The script below fixes all the issues by moving the print statements inside the for loop. In your PyCharm standard output you should now get many blocks of printed products, and fixing your file write should show a similar result in the CSV file.
Py3.5+ is a little bit childish when it comes to print('title' + title). IMO... the py2.x style should have been kept, as it gives more flexibility and reduces RSI through less typing. Anyway, the iteration through this phone web page should work now like a PyCharm..
repr comment : no you didn't use repr at all and its not needed but....
For print syntax examples check here and for the official python docs here.
In addition, I've added some formatting code for your output-file. It should be now in columns... and readable. Enjoy!
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.tradera.com/search?itemStatus=Ended&q=iphone+6+-6s+64gb+-plus'
filename = "ip6.csv"
headers = "title, link, price, bids\n"


def parse_container(container):
    """Return ``(title, link, price, bids)`` read from one item-card element."""
    heading = container.div.div.h3
    price = container.findAll("span", {"class": "item-card-details-price-amount"})[0].text
    bids = container.findAll("span", {"class": "item-card-details-bids"})[0].text
    return heading["title"], heading.a["href"], price, bids


def column_widths(rows):
    """Return the longest title, price and bids lengths found in ``rows``.

    ``rows`` is an iterable of ``(title, link, price, bids)`` tuples; the
    result is ``(0, 0, 0)`` when there are no rows.
    """
    l1 = l2 = l3 = 0
    for title, _link, price, bids in rows:
        l1 = max(l1, len(title))
        l2 = max(l2, len(price))
        l3 = max(l3, len(bids))
    return l1, l2, l3


def format_row(row, widths):
    """Return one output line with the columns padded to ``widths``.

    The title is left-aligned to the longest title; price and bids are
    right-aligned with one extra leading space; the link follows two spaces.
    """
    title, link, price, bids = row
    l1, l2, l3 = widths
    return '%s, %s, %s, %s%s\n' % (
        title.ljust(l1), price.rjust(l2 + 1), bids.rjust(l3 + 1), 2 * ' ', link)


def main():
    """Download the search page, print every item and write the aligned CSV."""
    uClient = uReq(my_url)
    try:
        page_html = uClient.read()
    finally:
        uClient.close()
    #html parsing
    page_soup = soup(page_html, "html.parser")
    #Container
    containers = page_soup.findAll("div", {"class": "item-card-details"})
    # Parse each container once; the same rows feed both the width
    # calculation and the output loop.
    rows = [parse_container(container) for container in containers]
    widths = column_widths(rows)
    l1, l2, l3 = widths
    # The with-block guarantees the file is flushed and closed
    # (a bare `f.close` without parentheses never called close).
    with open(filename, "w", encoding="utf-8") as f:
        f.write(headers)
        for row in rows:
            title, link, price, bids = row
            # calculate distances between columns
            d1 = l1 - len(title) + 0
            d2 = l2 - len(price) + 1
            d3 = l3 - len(bids) + 1
            print("title : %s-%s %s." % (l1, d1, title))
            print("price : %s-%s %s." % (l2, d2, price))
            print("bids : %s-%s %s." % (l3, d3, bids))
            print("link : %s." % link)
            f.write(format_row(row, widths))


if __name__ == "__main__":
    main()
Thank you all for helping me solve this. It was the indentation of the print lines. You are the best!
help, I'm grabbing multiple href url links from a website and trying to append each title and body text of the urls into another array. however when I run something similar to this, I only grab one title, with all the text of the other links together.
# Fragment from the question; names such as url, urllinks, i and
# grab_web_text are defined outside the part shown here.
request = requests.get(url)
somecontents = request.content
soup = BeautifulSoup(somecontents, "html.parser")
# NOTE(review): the return value of prettify() is not used.
soup.prettify()
gethref = urllinks.get("href")
# Keep only hrefs that contain "http" and contain neither "photo" nor "img".
if gethref is not None and\
"http" in gethref and\
"photo" not in gethref and\
"img" not in gethref:
page_links = []
tags_in_link = gethref
page_links.append(tags_in_link)
hrefdataset = ','.join(page_links)
for each_link in i:
# NOTE(review): the title is read from `soup`, the page fetched at the top of
# this fragment, not from each_link.
website_header_title = soup.title.string
# Collapse every run of non-letter characters in the title into one space.
parse_title = re.sub('[^A-Za-z]+', ' ', website_header_title)
time.sleep(.05)
done = grab_web_text(each_link)
testintry = []
testintry.append("Website Title: " + parse_title + "," + " ")
# NOTE(review): list.append returns None, so `text` is None after this line.
text = testintry.append("Body: " + done)
I would like each link in: how can I format it into this from what I have?
[{"Website Title: " "title", "Body: " "Body},
[{"Website Title: " "title", "Body: " "Body},
[{"Website Title: " "title", "Body: " "Body},
[{"Website Title: " "title", "Body: " "Body}]
You can create a list of dictionaries like this:
def get_link_info(l):
    """Return ``(title, body)`` for the page at URL ``l``.

    The title comes from the <title> of the page fetched from ``l`` itself,
    with every run of non-letter characters collapsed to a single space; it
    is an empty string when the page has no usable title. The body is
    whatever ``grab_web_text`` returns for ``l``.
    """
    page = BeautifulSoup(requests.get(l).content, "html.parser")
    # page.title is None when there is no <title> tag, and .string may be None.
    website_header_title = page.title.string if page.title is not None else None
    parse_title = re.sub('[^A-Za-z]+', ' ', website_header_title or '')
    # Use the function's own argument, not a name from an enclosing loop.
    done = grab_web_text(l)
    return (parse_title, done)


print([{t: d} for t, d in (get_link_info(i) for i in links)])
How does this work?
for i in links is a loop over all the links.
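get_link_info returns a tuple containing the title and done
for t, d in (...) is a loop over the resulting tuples
{t: d} for t, d in (...) builds a single-entry dict {t: d} for each tuple (together with the outer [] this is a list comprehension, not a dict comprehension)
The outer [] collects those dicts into a list.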
I'm having a bit of trouble automatically scraping data in a table from a Wikipedia article. First I was getting an encoding error. I specified UTF-8 and the error went away, but the scraped data doesn't display a lot of the characters correctly. You will be able to tell from the code that I am a complete newbie:
from bs4 import BeautifulSoup
import urllib2
# Download the article with a custom User-Agent header and parse it.
wiki = "http://en.wikipedia.org/wiki/Anderson_Silva"
header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki,headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)
# Start every column variable as an empty string.
Result = ""
Record = ""
Opponent = ""
Method = ""
Event = ""
Date = ""
Round = ""
Time = ""
Location = ""
Notes = ""
# First table whose class attribute is "wikitable sortable".
table = soup.find("table", { "class" : "wikitable sortable" })
f = open('output.csv', 'w')
for row in table.findAll("tr"):
cells = row.findAll("td")
#For each "tr", assign each "td" to a variable.
# Only rows with exactly ten <td> cells are unpacked.
if len(cells) == 10:
Result = cells[0].find(text=True)
Record = cells[1].find(text=True)
Opponent = cells[2].find(text=True)
Method = cells[3].find(text=True)
Event = cells[4].find(text=True)
Date = cells[5].find(text=True)
Round = cells[6].find(text=True)
Time = cells[7].find(text=True)
Location = cells[8].find(text=True)
Notes = cells[9].find(text=True)
# NOTE(review): Notes is read above but is not part of the line built here.
write_to_file = Result + "," + Record + "," + Opponent + "," + Method + "," + Event + "," + Date + "," + Round + "," + Time + "," + Location + "\n"
# Despite its name, this holds the UTF-8 encoded form of the line.
write_to_unicode = write_to_file.encode('utf-8')
print write_to_unicode
f.write(write_to_unicode)
f.close()
As pswaminathan pointed out, using the csv module will help greatly. Here is how I do it:
import csv

table = soup.find('table', {'class': 'wikitable sortable'})
# Python 2's csv writer expects its output file in binary mode; text mode
# can add an extra carriage return to every row on some platforms.
with open('out2.csv', 'wb') as f:
    csvwriter = csv.writer(f)
    for row in table.findAll('tr'):
        # UTF-8 encode each cell, since the Python 2 csv module writes byte strings.
        cells = [c.text.encode('utf-8') for c in row.findAll('td')]
        # Keep only rows with exactly ten cells.
        if len(cells) == 10:
            csvwriter.writerow(cells)
Discussion
Using the csv module, I created a csvwriter object connected to my output file.
By using the with command, I don't need to worry about closing the output file after done: it will be closed after the with block.
In my code, cells is a list of UTF8-encoded text extracted from the td tags within a tr tag.
I used the construct c.text, which is more concise than c.find(text=True).