I've got a CSV file with URLs and I need to scrape metadata from those websites. I'm using Python requests for that, with the code below:
from tempfile import NamedTemporaryFile
import shutil
import csv
from bs4 import BeautifulSoup
import requests
import re
import html5lib
import sys
#import logging
filename = 'TestWTF.csv'
#logging.basicConfig(level=logging.DEBUG)
#Get filename (with extension) from terminal
#filename = sys.argv[1]
tempfile = NamedTemporaryFile(delete=False)
read_timeout = 1.0
#Does the actual scraping, returns metaTag data
def getMetadata(url, metaTag):
    r = requests.get("http://" + url, timeout=2)
    data = r.text
    soup = BeautifulSoup(data, 'html5lib')
    metadata = soup.findAll(attrs={"name": metaTag})
    return metadata
#Gets either keyword or description
def addDescription(row):
    scrapedKeywordsData = getMetadata(row, 'keywords')
    if not scrapedKeywordsData:
        print row + ' NO KEYWORDS'
        scrapedKeywordsData = getMetadata(row, 'description')
    if not scrapedKeywordsData:
        return ''
    return scrapedKeywordsData[0]
def prepareString(data):
    output = data
    #Get rid of opening meta content
    if output.startswith('<meta content="'):
        output = data[15:]
    #Get rid of closing meta content (keywords)
    if output.endswith('" name="keywords"/>'):
        output = output[:-19]
    #Get rid of closing meta content (description)
    if output.endswith('" name="description"/>'):
        output = output[:-22]
    return output
def iterator():
    with open(filename, 'rb') as csvFile, tempfile:
        reader = csv.reader(csvFile, delimiter=',', quotechar='"')
        writer = csv.writer(tempfile, delimiter=',', quotechar='"')
        i = 0
        for row in reader:
            try:
                data = str(addDescription(row[1]))
                row[3] = prepareString(data)
            except requests.exceptions.RequestException as e:
                print e
            except requests.exceptions.Timeout as e:
                print e
            except requests.exceptions.ReadTimeout as e:
                print "lol"
            except requests.exceptions.ConnectionError as e:
                print "These aren't the domains we're looking for."
            except requests.exceptions.ConnectTimeout as e:
                print "Too slow Mojo!"
            writer.writerow(row)
            i = i + 1
            print i
    shutil.move(tempfile.name, filename)
def main():
    iterator()
#Defining main function
if __name__ == '__main__':
    main()
It works just fine, but at some URLs (maybe 2-3 out of 3000) it will suddenly stop and not move on to the next one after the timeout. I have to kill it with Ctrl+C, which means the file never gets saved.
I suspect it's a problem with catching exceptions, but I cannot figure out which one or what to do about it. I'm more than happy to simply skip the URL it gets stuck on.
EDIT:
Added traceback:
^CTraceback (most recent call last):
File "blacklist.py", line 90, in <module>
main()
File "blacklist.py", line 85, in main
iterator()
File "blacklist.py", line 62, in iterator
data = str(addDescription (row[1] ))
File "blacklist.py", line 30, in addDescription
scrapedKeywordsData = getMetadata(row, 'keywords')
File "blacklist.py", line 25, in getMetadata
metadata = soup.findAll(attrs={"name":metaTag})
File "/Library/Python/2.7/site-packages/bs4/element.py", line 1259, in find_all
return self._find_all(name, attrs, text, limit, generator, **kwargs)
File "/Library/Python/2.7/site-packages/bs4/element.py", line 537, in _find_all
found = strainer.search(i)
File "/Library/Python/2.7/site-packages/bs4/element.py", line 1654, in search
found = self.search_tag(markup)
File "/Library/Python/2.7/site-packages/bs4/element.py", line 1626, in search_tag
if not self._matches(attr_value, match_against):
File "/Library/Python/2.7/site-packages/bs4/element.py", line 1696, in _matches
if isinstance(markup, Tag):
KeyboardInterrupt
EDIT 2:
Example website for which script doesn't work: miniusa.com
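For what it's worth, the traceback above shows the interrupt landing inside BeautifulSoup's findAll rather than inside requests, so the timeout=2 on the HTTP call never fires here; requests' timeout also only bounds individual socket operations, not the total time spent per URL. One way to guarantee the loop keeps moving is to put a hard wall-clock deadline around the whole fetch-and-parse step and skip the URL when it is exceeded. The following is only a sketch, assuming a Unix platform (signal.alarm is not available on Windows) and that it runs in the main thread; addDescriptionWithDeadline, UrlDeadlineExceeded and the 10-second deadline are names and values invented here:

    import signal

    class UrlDeadlineExceeded(Exception):
        pass

    def _deadline_handler(signum, frame):
        raise UrlDeadlineExceeded()

    def addDescriptionWithDeadline(row, deadline=10):
        # Hard wall-clock cap on the whole fetch-and-parse step, so a URL that
        # stalls inside BeautifulSoup (or anywhere else) is skipped instead of
        # hanging the script. Unix only; must run in the main thread.
        signal.signal(signal.SIGALRM, _deadline_handler)
        signal.alarm(deadline)
        try:
            return addDescription(row)
        finally:
            signal.alarm(0)  # always cancel the pending alarm

In iterator() you would then call addDescriptionWithDeadline(row[1]) instead of addDescription(row[1]) and add an except UrlDeadlineExceeded: clause alongside the existing requests handlers, so the row is still written and the loop moves on to the next URL.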
Related
I'm writing a short piece of code in Python to check the status code of a list of URLs. The steps are:
1. Read the URLs from a CSV file.
2. Check the request status code.
3. Write the status code into the CSV next to the checked URL.
The first two steps I've managed to do, but I'm stuck with writing the output of the requests into the same CSV, next to the URLs. Please help.
import urllib.request
import urllib.error
from multiprocessing import Pool

file = open('innovators.csv', 'r', encoding="ISO-8859-1")
urls = file.readlines()

def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        print('URLError: {}'.format(e.reason) + ', ' + url)
    else:
        print('200' + ', ' + url)

if __name__ == "__main__":
    p = Pool(processes=1)
    result = p.map(checkurl, urls)
    with open('innovators.csv', 'w') as f:
        for line in file:
            url = ''.join(line)
            checkurl(urls + "," + checkurl)
The .readlines() call leaves the file object at the end of the file. When you then loop over file again without first rewinding it (file.seek(0)) or closing and reopening it, there are no lines left to read. It's always recommended to use the with open(...) as file construct to ensure the file is closed when you're finished with it.
Additionally, there is an error in your call to checkurl: you are adding a list (urls) to a string (",") to a function object (checkurl).
You probably meant for this section to read
    with open('innovators.csv', 'w') as f:
        for line in urls:
            url = ''.join(line.replace('\n', ''))  # readlines leaves a linefeed character at the end of each line
            f.write(url + "," + checkurl(url))
The checkurl function should return what you intend to place into the CSV file; at the moment you are simply printing to standard output (the screen). So replace your checkurl with
    def checkurl(url):
        try:
            conn = urllib.request.urlopen(url)
            ret = '0'
        except urllib.error.HTTPError as e:
            ret = 'HTTPError: {}'.format(e.code)
        except urllib.error.URLError as e:
            ret = 'URLError: {}'.format(e.reason)
        else:
            ret = '200'
        return ret
or something equivalent to your needs.
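Putting the two pieces together, here is a minimal end-to-end sketch; it assumes the input file is innovators.csv as above, writes the results to a separate file, innovators_checked.csv (an invented name), so the input isn't overwritten while it is being read, and lets the csv module handle the quoting instead of concatenating strings by hand:

    import csv
    import urllib.request
    import urllib.error

    def checkurl(url):
        # Return the status as a string instead of printing it.
        try:
            urllib.request.urlopen(url)
        except urllib.error.HTTPError as e:
            return 'HTTPError: {}'.format(e.code)
        except urllib.error.URLError as e:
            return 'URLError: {}'.format(e.reason)
        return '200'

    if __name__ == '__main__':
        with open('innovators.csv', 'r', encoding='ISO-8859-1') as f_in:
            urls = [line.strip() for line in f_in if line.strip()]
        with open('innovators_checked.csv', 'w', newline='') as f_out:
            writer = csv.writer(f_out)
            for url in urls:
                writer.writerow([url, checkurl(url)])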
Save the status in a dict, convert it to a DataFrame, and then write that to a CSV file. str(code.getcode()) will return '200' if the URL connects; otherwise urlopen raises an exception, for which I assign the status '000'. So your CSV file will contain url,200 if the URL is connecting and url,000 if it is not.
import pandas as pd

status_dict = {}
for line in lines:  # lines: the URLs read from the input file
    try:
        code = urllib.request.urlopen(line)
        status = str(code.getcode())
        status_dict[line] = status
    except:
        status = "000"
        status_dict[line] = status

# A dict of scalar values can't be passed to DataFrame directly; build it from the (url, status) pairs.
df = pd.DataFrame(list(status_dict.items()), columns=['url', 'status'])
df.to_csv('filename.csv', index=False, header=False)
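A slightly shorter variant of the last two lines, sketched on the assumption that pandas is imported as pd: a Series maps each URL straight to its status, which avoids building the DataFrame at all.

    pd.Series(status_dict).to_csv('filename.csv', header=False)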
I created this code to scan my samples_vsdt.txt, extract certain values, and write them to a CSV, but I'm getting a StopIteration error and it doesn't even read the text file. I've been trying to solve this for hours; any idea what's causing the problem?
Here is how my code works. For example, this line:
Scanning samples_extracted\82e5b144cb5f1c10629e72fc1291f535db7b0b40->(Word 2003 XML Document 1003-1)
will be written to the CSV as this:
82e5b144cb5f1c10629e72fc1291f535db7b0b40,Word 2003 XML Document 1003-1
Here is my code. It works for all my other txt files, but this one, samples_vsdt.txt, doesn't work properly:
import csv, re

out_vsdt = "samples_vsdt.txt"
out_sha1_vsdt = "sha1_vsdt.csv"

def read_text_file(out_vsdt):
    with open(out_vsdt) as f:
        data = []
        for line in f:
            if "Scanning " + new in line and "(" in line:
                try:
                    sha = re.search('\\\(.*)->', line).group(1)
                    desc = re.search('->\((.*)\)', line).group(1)
                except AttributeError:
                    desc = None
                    sha = None
                mix = sha, desc
                data.append(mix)
                continue
            if "Scanning " + new in line:
                try:
                    sha = re.search('\\\(.*)$', line).group(1)
                    while True:
                        i = next(f)
                        if "(" in i:
                            try:
                                desc = re.search('->\((.*)\)', i).group(1)
                                break
                            except AttributeError:
                                desc = None
                                sha = None
                    mix = sha, desc
                    data.append(mix)
                except AttributeError:
                    sha = None
    return data
def write_csv_file(data, out_sha1_vsdt):
    with open(out_sha1_vsdt, 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"')
        csvwriter.writerow(['SHA-1', 'VSDT', 'DESC'])
        for row in data:
            csvwriter.writerow(row)

def main():
    data = read_text_file(out_vsdt)
    write_csv_file(data, out_sha1_vsdt)

if __name__ == '__main__':
    main()
    print "Parsing Successful"
Gives me error:
Traceback (most recent call last):
File "C:\Users\trendMICRO\Desktop\ojt\scanner\parser.py", line 65, in <module>
main()
File "C:\Users\trendMICRO\Desktop\ojt\scanner\parser.py", line 61, in main
data = read_text_file(out_vsdt)
File "C:\Users\trendMICRO\Desktop\ojt\scanner\parser.py", line 37, in read_text_file
i = next(f)
StopIteration
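For what it's worth, the StopIteration itself comes from calling next(f) after the file iterator is exhausted, i.e. when a final "Scanning" line is never followed by a line containing "(". A minimal guard, sketched against the inner loop above, is to give next() a default and stop when it is hit:

    while True:
        i = next(f, None)      # None at end of file instead of raising StopIteration
        if i is None:
            desc = None        # no matching "(" line was found before EOF
            break
        if "(" in i:
            try:
                desc = re.search('->\((.*)\)', i).group(1)
                break
            except AttributeError:
                desc = None
                sha = None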
An alternative approach could be to just use a regular expression to extract whole blocks:
import csv
import re
import string

out_vsdt = "samples_vsdt.txt"
out_sha1_vsdt = "sha1_vsdt.csv"

with open(out_vsdt) as f_input:
    vscan32 = f_input.read()

with open(out_sha1_vsdt, 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['SHA-1', 'VSDT', 'DESC'])

    for sha, desc, vsdt in re.findall(r'Scanning.*?\\([0-9a-f]+)(.*?)->\((.*?)\)$', vscan32, re.S + re.M):
        desc = '|'.join(line.strip() for line in desc.splitlines() if len(line.strip()))
        desc = ''.join(filter(lambda x: x in string.printable, desc))  # remove non-printable characters
        csv_output.writerow([sha, vsdt, desc])
This uses a multi-line expression that looks for blocks starting with Scanning. Where a block spans multiple lines, the extra lines are stripped and joined together using a |. Finally, any non-printable characters are removed from the description.
This would give you an output starting something like:
SHA-1,VSDT,DESC
004d44eeecae27314f8bd3825eb82d2f40182b51,WIN32 EXE 7-2,
07eab9ea58d4669febf001d52c5182ecf579c407,WIN32 EXE 7-2,
0d558bb5e0a5b544621af0ffde1940615ac39deb,WIN32 EXE 7-2,
5172c70c1977bbddc2a163f6ede46595109c7835,WIN32 EXE 7-2,- $R0\NsCpuCNMiner32.exe->Found Virus [WORM_CO.331300D2]|- $R0\NsCpuCNMiner64.exe->Found Virus [WORM_CO.331300D2]|- $R0\NsGpuCNMiner.exe->Found Virus [TROJ64_.743CC567]
This assumes you are using Python 3.x
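To make the pattern concrete, here is a small self-contained check of the same regular expression against the sample line quoted in the question (only the variable names are made up):

    import re

    sample = ("Scanning samples_extracted\\82e5b144cb5f1c10629e72fc1291f535db7b0b40"
              "->(Word 2003 XML Document 1003-1)\n")
    pattern = r'Scanning.*?\\([0-9a-f]+)(.*?)->\((.*?)\)$'
    for sha, desc, vsdt in re.findall(pattern, sample, re.S + re.M):
        print(sha)   # 82e5b144cb5f1c10629e72fc1291f535db7b0b40
        print(vsdt)  # Word 2003 XML Document 1003-1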
I downloaded a zip file from https://clinicaltrials.gov/AllPublicXML.zip, which contains over 200k XML files (most are < 10 KB in size), to a directory (see 'dirpath_zip' in the CODE) I created on Ubuntu 16.04 (on DigitalOcean). What I'm trying to accomplish is loading all of these into MongoDB (also installed in the same location as the zip file).
I ran the CODE below twice and it consistently failed when processing the 15988th file.
I've googled around and tried reading other posts about this particular error, but couldn't find a way to solve it. Actually, I'm not really sure what the problem is... any help is much appreciated!!
CODE:
import re
import sys
import json
import zipfile
import pymongo
import datetime
import xmltodict
from bs4 import BeautifulSoup
from pprint import pprint as ppt

def timestamper(stamp_type="regular"):
    if stamp_type == "regular":
        timestamp = str(datetime.datetime.now())
    elif stamp_type == "filename":
        timestamp = str(datetime.datetime.now()).replace("-", "").replace(":", "").replace(" ", "_")[:15]
    else:
        sys.exit("ERROR [timestamper()]: unexpected 'stamp_type' (parameter) encountered")
    return timestamp

client = pymongo.MongoClient()
db = client['ctgov']
coll_name = "ts_" + timestamper(stamp_type="filename")
coll = db[coll_name]

dirpath_zip = '/glbdat/ctgov/all/alltrials_20180402.zip'
z = zipfile.ZipFile(dirpath_zip, 'r')

i = 0
for xmlfile in z.namelist():
    print(i, 'parsing:', xmlfile)
    if xmlfile == 'Contents.txt':
        print(xmlfile, '==> entering "continue"')
        continue
    else:
        soup = BeautifulSoup(z.read(xmlfile), 'lxml')
        json_study = json.loads(re.sub('\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
        coll.insert_one(json_study)
    i += 1
ERROR MESSAGE:
Traceback (most recent call last):
File "zip_to_mongo_alltrials.py", line 38, in <module>
soup = BeautifulSoup(z.read(xmlfile), 'lxml')
File "/usr/local/lib/python3.5/dist-packages/bs4/__init__.py", line 225, in __init__
markup, from_encoding, exclude_encodings=exclude_encodings)):
File "/usr/local/lib/python3.5/dist-packages/bs4/builder/_lxml.py", line 118, in prepare_markup
for encoding in detector.encodings:
File "/usr/local/lib/python3.5/dist-packages/bs4/dammit.py", line 264, in encodings
self.chardet_encoding = chardet_dammit(self.markup)
File "/usr/local/lib/python3.5/dist-packages/bs4/dammit.py", line 34, in chardet_dammit
return chardet.detect(s)['encoding']
File "/usr/lib/python3/dist-packages/chardet/__init__.py", line 30, in detect
u.feed(aBuf)
File "/usr/lib/python3/dist-packages/chardet/universaldetector.py", line 128, in feed
if prober.feed(aBuf) == constants.eFoundIt:
File "/usr/lib/python3/dist-packages/chardet/charsetgroupprober.py", line 64, in feed
st = prober.feed(aBuf)
File "/usr/lib/python3/dist-packages/chardet/hebrewprober.py", line 224, in feed
aBuf = self.filter_high_bit_only(aBuf)
File "/usr/lib/python3/dist-packages/chardet/charsetprober.py", line 53, in filter_high_bit_only
aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
File "/usr/lib/python3.5/re.py", line 182, in sub
return _compile(pattern, flags).sub(repl, string, count)
MemoryError
Try moving the reading of each file and the insert into the db into a separate method.
Also call gc.collect() to force garbage collection.
import gc

def read_xml_insert(xmlfile):
    soup = BeautifulSoup(z.read(xmlfile), 'lxml')
    json_study = json.loads(re.sub('\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
    coll.insert_one(json_study)

for xmlfile in z.namelist():
    print(i, 'parsing:', xmlfile)
    if xmlfile == 'Contents.txt':
        print(xmlfile, '==> entering "continue"')
        continue
    else:
        read_xml_insert(xmlfile)
    i += 1
    gc.collect()
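One more thing that may be worth trying, based on the traceback above: the MemoryError is raised inside chardet while BeautifulSoup is guessing the encoding of the markup. BeautifulSoup accepts a from_encoding argument that skips that detection step, so if the XML files are known to be UTF-8 (an assumption here, not something stated in the question), you could pass it explicitly:

    def read_xml_insert(xmlfile):
        # from_encoding skips BeautifulSoup's chardet-based encoding detection,
        # which is where the MemoryError in the traceback originates.
        soup = BeautifulSoup(z.read(xmlfile), 'lxml', from_encoding='utf-8')
        json_study = json.loads(re.sub('\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
        coll.insert_one(json_study)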
Hello all, I keep getting this error while writing a small program to sort large CSV files. Below is my code and the error; what am I doing wrong?
if selection:
    for stuff in stuffs:
        try:
            textFile = open("output.txt", 'w')
            mycsv = csv.reader(open(stuff))
            d_reader = csv.DictReader(mycsv)
            headers = d_reader.fieldnames  # <-- Error happens here
            if selection in headers:
                placeInList = headers.index(selection)
                #placeInList = selection.index(selection)
                for selection in tqdm(mycsv, desc='Extracting column values...', leave=True):
                    textFile.write(str(selection[int(placeInList)]) + '\n')
                print 'Done!'
                textFile.close()
                sys.exit()
        except IOError:
            print 'No CSV file present in directory'
            sys.exit()
else:
    sys.exit()
And the error:
Traceback (most recent call last):
  File "postcodeExtractor.py", line 27, in <module>
    headers = d_reader.fieldnames
  File "C:\Python27\lib\csv.py", line 90, in fieldnames
    self._fieldnames = self.reader.next()
TypeError: expected string or Unicode object, list found
Instead of:
    mycsv = csv.reader(open(stuff))
    d_reader = csv.DictReader(mycsv)
you want:
    d_reader = csv.DictReader(open(stuff))
The first line is the problem.
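To spell out why: csv.DictReader expects an iterable of text lines (such as a file object); wrapping the file in csv.reader first means DictReader's internal reader receives lists instead of strings, which is exactly what the TypeError is complaining about. A minimal sketch of the corrected block (the variable names follow the question, and a with statement is used so the file gets closed):

    import csv

    with open(stuff) as csv_file:
        d_reader = csv.DictReader(csv_file)  # pass the file object, not a csv.reader
        headers = d_reader.fieldnames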
print (data[1])
ymcacanada.ca
answers = dns.resolver.query(data[1], 'MX')
traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "dns\resolver.py", line 981, in query
File "dns\resolver.py", line 912, in query
File "dns\resolver.py", line 143, in __init__
dns.resolver.NoAnswer
I expected data[1] to be equal to "ymcacanada", so I can call the whole data list in a for loop with dns.resolver looking up the MX records.
What I got was this error, which does not happen when I manually run the code with the URL.
The next thing I want to do is write those lookups into a CSV.
Here's my code so far (it doesn't work!):
import dns
import dns.resolver
import os
import csv
import dns.resolver

file = "domains.txt"
f = open(file)
data = f.read()
f.close

list = []
for url in data:
    answers = dns.resolver.query(url, 'MX')
    for rdata in answers:
        x = [rdata.exchange, rdata.preference]
        print(x)
        list.append([x])
EDIT
Hey all, here is my working code in its totality. I'm sure it can be improved, but I'm still learning!
import dns
import dns.resolver
import os
import csv

##tested on python 2.7 only
errcountA = 0
errcountMX = 0
listoflists = []

with open("domains.txt") as f:
    for url in f:
        A = []
        MX = []
        url = url.strip()
        try:
            answers = dns.resolver.query(url, 'A')  ##Get the A record
            for rdata in answers:
                A.append(rdata.address)  ##Add A records to the list
        except:  ##In case the URL doesn't resolve
            A = "Error resolving"
            errcountA += 1
        try:  ##Get the MX record
            answers = dns.resolver.query(url, 'MX')
            for rdata in answers:
                MX.append([rdata.preference, rdata.exchange])
        except:  ##In case the URL doesn't resolve
            MX = "Error resolving"
            errcountMX += 1
        list = [url, MX, A]
        print(list)
        listoflists.append(list)

with open('output.csv', 'wb') as csvfile:  ##write the csv file
    writer = csv.writer(csvfile)
    for r in listoflists:
        writer.writerow(r)

print listoflists
print ("There were %e A record errors") %errcountA
print ("There were %f MX record errors") %errcountMX
print ("Done!")
Your url data includes the trailing newline. Use .strip() to remove any leading or trailing whitespace from the value. Note also that data = f.read() in your original code returns one big string, so for url in data walks through it character by character; iterating over the file object itself gives you one line at a time. Try this:
with open("domaints.txt") as f:
for url in f:
url = url.strip()
answers = dns.resolver.query(url, 'MX')
# Continue as before