Strategy for scraping web pages, maximizing information gathered - python

Here's the problem:
Users register for a site and can pick one of 8 job categories, or choose to skip this step. I want to classify the users who've skipped that step into job categories, based on the domain name in their email address.
Current setup:
Using a combination of Beautiful Soup and nltk, I scrape the homepage and look for links to pages on the site that contain the word "about". I scrape that page, too. I've copied the bit of code that does the scraping at the end of this post.
The issue:
I'm not getting enough data to get a good learning routine in place. I'd like to know if my scraping algorithm is set up for success. In other words, are there any gaping holes in my logic, or is there a better way to ensure that I get a good chunk of text describing what kind of work a company does?
The (relevant) code:
import bs4 as bs
import httplib2 as http
import nltk

# Only these characters are valid in a url
ALLOWED_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]#!$&'()*+,;="

class WebPage(object):

    def __init__(self, domain):
        """
        Constructor

        :param domain: URL to look at
        :type domain: str
        """
        self.url = 'http://www.' + domain
        try:
            self._get_homepage()
        except:  # Catch specific here?
            self.homepage = None

        try:
            self._get_about_us()
        except:
            self.about_us = None

    def _get_homepage(self):
        """
        Open the home page, looking for redirects
        """
        import re
        web = http.Http()
        response, pg = web.request(self.url)

        # Check for redirects:
        if int(response.get('content-length', 251)) < 250:
            new_url = re.findall(r'(https?://\S+)', pg)[0]
            if len(new_url):  # otherwise there's not much I can do...
                self.url = ''.join(x for x in new_url if x in ALLOWED_CHARS)
                response, pg = web.request(self.url)

        self.homepage = self._parse_html(nltk.clean_html(pg))
        self._raw_homepage = pg

    def _get_about_us(self):
        """
        Soup-ify the home page, find the "About us" page, and store its contents in a
        string
        """
        soup = bs.BeautifulSoup(self._raw_homepage)
        links = [x for x in soup.findAll('a') if x.get('href', None) is not None]
        about = [x.get('href') for x in links if 'about' in x.get('href', '').lower()]

        # need to find about or about-us
        about_us_page = None
        for a in about:
            bits = a.strip('/').split('/')
            if len(bits) == 1:
                about_us_page = bits[0]
            elif 'about' in bits[-1].lower():
                about_us_page = bits[-1]

        # otherwise assume shortest string is top-level about pg.
        if about_us_page is None and len(about):
            about_us_page = min(about, key=len)

        self.about_us = None
        if about_us_page is not None:
            self.about_us_url = self.url + '/' + about_us_page
            web = http.Http()
            response, pg = web.request(self.about_us_url)
            if int(response.get('content-length', 251)) > 250:
                self.about_us = self._parse_html(nltk.clean_html(pg))

    def _parse_html(self, raw_text):
        """
        Clean html coming from a web page. Gets rid of
          - all '\n' and '\r' characters
          - all zero length words
          - all unicode characters that aren't ascii (i.e., &...)
        """
        lines = [x.strip() for x in raw_text.splitlines()]
        all_text = ' '.join([x for x in lines if len(x)])  # zero length strings
        return [x for x in all_text.split(' ') if len(x) and x[0] != '&']

It is outside of what you are asking, but I would look at calling an external data source that has already collected this information. A good place to find such a service is the Programmable Web directory (for instance Mergent Company Fundamentals). Not all the data listed on Programmable Web is up to date, but there seem to be plenty of API providers out there.
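If you do try that route, the lookup itself is only an HTTP call. Here is a minimal sketch against a hypothetical company-data endpoint; the URL, query parameters, and response fields are placeholders, not any particular provider's real API:

import json
import urllib.parse
import urllib.request

def industry_for_domain(domain, api_key):
    """Look up a company's industry from its email domain via an external API."""
    # Hypothetical endpoint and response fields -- substitute whichever
    # provider you pick from Programmable Web.
    url = ('https://api.example-company-data.test/v1/companies'
           '?domain=' + urllib.parse.quote(domain) + '&key=' + api_key)
    with urllib.request.urlopen(url) as response:
        record = json.loads(response.read().decode('utf-8'))
    return record.get('industry')  # map this onto your 8 job categories

# industry = industry_for_domain('example.com', 'MY_API_KEY')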

Related

Problems with text data-cleaning in python

I am working on a program that crawls internet articles. The program starts by taking the start and end pages of the website as input.
The program works in the following order:
1. Crawl the article information (title, sort, time, contents)
2. Remove special characters
3. Extract only nouns
The problem seems to occur when extracting nouns while cleaning the article contents; everything works up to the stage before noun extraction.
The error message is as follows
ValueError: Length of passed values is 4, index implies 5
To work around this I tried adding each row with DataFrame append, but it does not solve the problem.
I use the konlpy Okt tagger (a Korean morpheme analyzer).
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
from konlpy.tag import Okt
from pandas import Series

i = input('Start page? : ')
k = input('End page? : ')
startpage = int(i)
lastpage = int(k)
count = int(i)

# Definition of text cleaning function
def text_cleaning(text):
    hangul = re.compile('[^ㄱ-ㅣ가-힣]+')
    result = hangul.sub(' ', text)
    return result

# Definition of nouns extraction function
def get_nouns(x):
    nouns_tagger = Okt()
    nouns = nouns_tagger.nouns(x)
    nouns = [noun for noun in nouns if len(noun) > 1]
    nouns = [noun for noun in nouns if noun not in stopwords]
    return nouns

# dataframe formation
columns = ['Title', 'Sort', 'Datetime', 'Article']
news_info = pd.DataFrame(columns=columns)
idx = 0
# Web-site page loop
while startpage < lastpage + 1:
    url = f'http://www.koscaj.com/news/articleList.html?page={startpage}&total=72698&box_idxno=&sc_section_code=S1N2&view_type=sm'
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all(class_='list-titles')
    print(f'-----{count}page result-----')

    # Articles loop in the web-site page
    for link in links:
        news_url = "http://www.koscaj.com" + link.find('a')['href']
        news_link = urllib.request.urlopen(news_url).read()
        soup2 = BeautifulSoup(news_link, 'html.parser')

        # an article's title
        title = soup2.find('div', {'class':'article-head-title'})
        if title:
            title = soup2.find('div', {'class':'article-head-title'}).text
        else:
            title = ''

        # an article's sort
        sorts = soup2.find('nav', {'class':'article-head-nav auto-marbtm-10'})
        try:
            sorts2 = sorts.find_all('a')
            sort = sorts2[2].text
        except:
            sort = ''

        # an article's time
        date = soup2.find('div', {'class':'info-text'})
        try:
            datetime = date.find('i', {'class':'fa fa-clock-o fa-fw'}).parent.text.strip()
            datetime = datetime.replace("승인", "")
        except:
            datetime = ''

        # an article's content
        article = soup2.find('div', {'id':'article-view-content-div'})
        if article:
            article = soup2.find('div', {'id':'article-view-content-div'}).text
            article = article.replace("\n", "")
            article = article.replace("\r", "")
            article = article.replace("\t", "")
            article = article.replace("[전문건설신문] koscaj#kosca.or.kr", "")
            article = article.replace("저작권자 © 대한전문건설신문 무단전재 및 재배포 금지", "")
            article = article.replace("전문건설신문", "")
            article = article.replace("다른기사 보기", "")
        else:
            article = ''

        # Remove special characters
        news_info['Title'] = news_info['Title'].apply(lambda x: text_cleaning(x))
        news_info['Sort'] = news_info['Sort'].apply(lambda x: text_cleaning(x))
        news_info['Article'] = news_info['Article'].apply(lambda x: text_cleaning(x))
Up to this point the program runs without problems. But as the error message shows, it then fails because the number of passed values and the length of the index no longer match.
Text data cleaning for noun extraction:
        # Dataframe for storing after crawling individual articles
        row = [title, sort, datetime, article]
        series = pd.Series(row, index=news_info.columns)
        news_info = news_info.append(series, ignore_index=True)

        # Load Korean stopword dictionary file
        path = "C:/Users/이바울/Desktop/이바울/코딩파일/stopwords-ko.txt"
        with open(path, encoding='utf-8') as f:
            stopwords = f.readlines()
        stopwords = [x.strip() for x in stopwords]

        news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))

    startpage += 1
    count += 1

news_info.to_excel(f'processing{lastpage-int(1)}-{startpage-int(1)}.xlsx')
print('Complete')
After defining the DataFrame with the existing 4 columns, I used append to add the noun-extraction result as a 5th column. I know this method adds a column regardless of the index name. As a result, the first article is crawled and shows its results, but from the next article onwards it stops working and the error occurs.
I solved the problem.
It came down to where the code sits inside the for loop.
I was able to fix it by repeatedly repositioning the problematic lines while leaving the code that already worked untouched.
In the end I solved it just by un-indenting ("backspacing") the line below twice, moving it out of the inner loops:
news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))
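For what it's worth, the error fits that explanation: after the first iteration the Nouns assignment gives news_info a fifth column, so the next four-value pd.Series(row, index=news_info.columns) no longer matches. Here is a minimal reproduction with made-up values (not the real articles); note that DataFrame.append was removed in pandas 2.0, so this mirrors the older pandas used in the question:

import pandas as pd

columns = ['Title', 'Sort', 'Datetime', 'Article']
news_info = pd.DataFrame(columns=columns)

# First row appends fine: four values, four columns.
news_info = news_info.append(pd.Series(['t1', 's1', 'd1', 'a1'], index=news_info.columns),
                             ignore_index=True)

# Adding the fifth column inside the loop grows news_info.columns to five entries...
news_info['Nouns'] = news_info['Article'].apply(lambda x: x.split())

# ...so the next four-value row would raise
# "ValueError: Length of passed values is 4, index implies 5":
# pd.Series(['t2', 's2', 'd2', 'a2'], index=news_info.columns)

# Either build each row against the fixed four-column list, or (as in the fix
# above) add 'Nouns' only after all rows have been appended.
news_info = news_info.append(pd.Series(['t2', 's2', 'd2', 'a2'], index=columns),
                             ignore_index=True)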

Web scraping: Index out of Bound (Possible scaling error)

Hi, I wrote a web scraping program and it gets the ASN numbers correctly, but after all the data is scraped it returns an "Array Out of Bounds" error.
I am using PyCharm and the latest Python version. Below is my code.
There is already a similar issue on Stack Overflow (Web Scraping List Index Out Of Range). It's the exact same error, but I am not able to put the pieces together and make it work for my list.
The error seems to be at current_country = link.split('/')[2]
Any help is appreciated. Thank you.
import urllib.request
import bs4
import re
import json

url = 'https://ipinfo.io/countries'
SITE = 'https://ipinfo.io'

def url_to_soup(url):
    req = urllib.request.Request(url)
    opener = urllib.request.build_opener()
    html = opener.open(req)
    soup = bs4.BeautifulSoup(html, "html.parser")
    return soup

def find_pages(page):
    pages = []
    for link in page.find_all(href=re.compile('/countries')):
        pages.append(link.get('href'))
    return pages

def scrape_pages(links):
    mappings = {}
    print("Scraping Pages for ASN Data...")
    for link in links:
        country_page = url_to_soup(SITE + link)
        current_country = link.split('/')[2]
        print(current_country)
        for row in country_page.find_all('tr'):
            columns = row.find_all('td')
            if len(columns) > 0:
                current_asn = re.findall(r'\d+', columns[0].string)[0]
                print(current_asn)
                """
                name = columns[1].string
                routes_v4 = columns[3].string
                routes_v6 = columns[5].string
                mappings[current_asn] = {'Country': current_country,
                                         'Name': name,
                                         'Routes v4': routes_v4,
                                         'Routes v6': routes_v6}
                return mappings """

main_page = url_to_soup(url)
country_links = find_pages(main_page)
#print(country_links)
asn_mappings = scrape_pages(country_links)
print(asn_mappings)
The last href on https://ipinfo.io/countries that contains the string "/countries" is actually just "/countries":
<li><a href="/countries">Global ASNs</a></li>
Splitting that link produces the list ["", "countries"], where the third element is missing. To fix this, simply check the list length before retrieving the third element:
...
current_country = link.split('/')
if len(current_country) < 3:
    continue
current_country = current_country[2]
...
Another solution is to exclude the last href by changing the regexp to:
...
for link in page.find_all(href=re.compile('/countries/')):
...
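In context, the second fix amounts to tightening the pattern inside find_pages so the bare "/countries" link never reaches scrape_pages. A minimal sketch of that version of the function from the question:

import re

def find_pages(page):
    pages = []
    # '/countries/' (with the trailing slash) skips the "Global ASNs" link
    # whose href is just '/countries', so every collected href splits into
    # at least three parts and link.split('/')[2] is always valid.
    for link in page.find_all(href=re.compile('/countries/')):
        pages.append(link.get('href'))
    return pages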

Search multiple pages for match

I'm trying to solve an exercise: basically, I have to parse a JSON page and search for an object. If the object is not found, I have to search the next page for it. If the person I'm looking for is on the first page I pass the test, but I fail if they're on another page.
I checked and each page is parsed correctly, but the return value is always undefined (None) if the person isn't on the first page.
This is my code:
import urllib.request
import json

class Solution:

    def __new__(self, character):
        url = 'https://challenges.hackajob.co/swapi/api/people/'
        numberOfFilms = 0
        #
        # Some work here; return type and arguments should be according to the problem's requirements
        #
        numberOfFilms = self.search(self, character, url)
        return numberOfFilms

    def search(self, character, url):
        numberOfFilms = 0
        found = False
        with urllib.request.urlopen(url) as response:
            data = response.read()
            jsonData = json.loads(data.decode('utf-8'))
            for r in jsonData['results']:
                if r['name'] == character:
                    return len(r['films'])
        if (jsonData['next']):
            nextPage = jsonData['next']
            self.search(self, character, nextPage)
Change the last line to return self.search(self, character, nextPage) so the result of the recursive call is propagated back to the caller.
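A minimal standalone sketch of the same idea, stripped of the class wrapper (the URL and field names come from the question; returning 0 when the character is never found is an assumption):

import urllib.request
import json

def search(character, url):
    # Fetch one page of results.
    with urllib.request.urlopen(url) as response:
        jsonData = json.loads(response.read().decode('utf-8'))
    for r in jsonData['results']:
        if r['name'] == character:
            return len(r['films'])
    # Not on this page: return the result of searching the next page.
    if jsonData['next']:
        return search(character, jsonData['next'])
    return 0  # assumed fallback when the character is never found

# print(search('Luke Skywalker', 'https://challenges.hackajob.co/swapi/api/people/'))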

Why does my script return a MemoryError?

I'm trying to gather some statistics about a web site. The site has categories and products; I don't download any information about the products, I only count them.
The point is that I either get a MemoryError, or just something like "Script ends with code -1073741819" (the number is exact).
I've tried printing the size of the category_urls variable after each loop and it does not increase.
EDIT:
The memory error is raised when the category being counted is too big (about 60,000 URLs).
The main loop is simple:
for category in categories:
    count_category(category)
I would expect the memory to be released after each iteration, but I can't see any release when I look at the Task Manager -> Memory tab (python.exe); the memory consumption just keeps growing.
In case it helps to solve the problem:
def count_category(url):
    category_urls = list(get_category_urls(url))
    mLib.printToFile('database/count.txt', str(len(category_urls)))
    set_spracovanie_kategorie(url)  # This fnc just writes category url into text file

def get_category_urls(url):
    log('Getting category urls: {}'.format(url))
    urls = []
    next_url = url
    i = 1
    while next_url:
        root = load_root(next_url)
        urls.extend(get_products_on_page(root))
        for x in urls:
            if 'weballow' in x:
                yield x
        next_url = next_page(root, url)  # next_page is defined below
        # if next_url == False:
        #     return urls
        i += 1

def get_products_on_page(root):
    hrefs = root.xpath('//div[@id="product-contain"]//h2/a/@href')
    return hrefs
AND LXML LOADING FUNCTIONS:
class RedirectException(Exception):
    pass

def load_url(url):
    r = requests.get(url, allow_redirects=False)
    if r.status_code == 301:
        raise RedirectException
    html = r.text
    return html

def load_root(url):
    html = load_url(url)
    return etree.fromstring(html, etree.HTMLParser())
NEXT PAGE:
def next_page(root, url):
    next = root.xpath('//a[@class="next"]/@href')
    if len(next) > 0:
        return urljoin(url, next[0])
    return False
Could you give me any advice on what to do?

File Storage Problem with Python Web Crawler

I am screen scraping data using a web crawler and storing the results (tweets from a Twitter page) as separate HTML files for each user I'm crawling. I intend to later parse the HTML files and store the data in a database for analysis. However, I am having a bizarre problem.
When I run the following program - a small snippet from the overall crawler - I am able to get a separate html file for each follower:
import re
import urllib2
import twitter

start_follower = "NYTimesKrugman"
depth = 3
searched = set()

api = twitter.Api()

def crawl(follower, in_depth):
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            crawl(name, in_depth-1)

crawl(start_follower, depth)

for x in searched:
    print x
print "Program is completed."
However, when I run the full crawler, I do not get a separate file for each follower:
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time

start_follower = "NYTimeskrugman"
depth = 2
searched = set()

api = twitter.Api()

def add_to_U(user):
    U.append(user)

def site(follower):  # creates a twitter site url in string format based on the follower username
    followersite = "http://mobile.twitter.com/" + follower
    return followersite

def getPage(follower):  # obtains access to a webpage
    url = site(follower)
    response = urllib.urlopen(url)
    return response

def getSoup(response):  # creates the parsing module
    html = response.read()
    soup = BeautifulSoup(html)
    return soup

def gettweets(soup, output):
    tags = soup.findAll('div', {'class' : "list-tweet"})  # to obtain tweet of a follower
    for tag in tags:
        a = tag.renderContents()
        b = str(a)
        output.write(b)
        output.write('\n\n')

def are_more_tweets(soup):  # to check whether there is more than one page on mobile twitter
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        test_b = str(b)
        if test_b.find('more') != -1:
            return True
    return False

def getnewlink(soup):  # to get the link to go to the next page of tweets on twitter
    links = soup.findAll('a', {'href': True}, {id : 'more_link'})
    for link in links:
        b = link.renderContents()
        if str(b) == 'more':
            c = link['href']
            d = 'http://mobile.twitter.com' + c
            return d

def crawl(follower, in_depth):  # main method of sorts
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        a = getPage(follower)
        soup = getSoup(a)
        gettweets(soup, output)
        tweets = are_more_tweets(soup)
        while(tweets):
            b = getnewlink(soup)
            red = urllib.urlopen(b)
            html = red.read()
            soup = BeautifulSoup(html)
            gettweets(soup, output)
            tweets = are_more_tweets(soup)
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            print name
            crawl(name, in_depth - 1)

crawl(start_follower, depth)
print("Program done. Look at output file.")
More specifically, I seem to get a separate html file for about the first five followers and then no new files appear to be created. Any help would be appreciated!
The depth value is different between the snippet and the full code, so you only get one level of recursion in the full code. Also, you only grab the first five names from the followers list (for name in list(names)[0:5]:), so you get six people in total: the starting follower and their first five friends.
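A quick way to sanity-check that count is a tiny sketch of the depth/fan-out arithmetic (standalone, not part of the crawler, and an upper bound since the searched set can prune repeats):

# Each call writes one file, then recurses into at most `fan_out` friends
# with in_depth - 1, stopping once in_depth reaches 0.
def files_written(in_depth, fan_out=5):
    if in_depth <= 0:
        return 0
    return 1 + fan_out * files_written(in_depth - 1, fan_out)

print(files_written(2))  # 6  -- the full crawler: start follower + 5 friends
print(files_written(3))  # 31 -- the snippet's depth of 3 goes one level deeper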
