I have posted a similar question before, but after reworking the project, I've gotten here:
With two csv files (new.csv, scrapers.csv) -
new.csv contains a single column:
'urls' = whole URLs
scrapers.csv contains two columns:
'scraper_dom' = A simplification of specific URL domains
'scraper_id' = An associated scraper_id that is used to import URLs to a separately managed database
Question
My goal here is to iterate through new.csv (parsing out fnetloc using urlparse) and perform a lookup against scrapers.csv to return the set of matching 'scraper_id' values for a given set of 'urls' (the way a VLOOKUP would work, or a JOIN in SQL), once urlparse does its thing to isolate the netloc within the URL (the result of fnetloc).
My next big issue is that urlparse does not parse the URLs (from new.csv) to the exact simplification found in the scrapers.csv file, so I'd be reliant on a sort of partial match until I can figure out the regular expressions to use for that part of it.
I've imported pandas because previous attempts found me creating DataFrames and performing a pd.merge but I couldn't get that to work either...
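To illustrate the kind of lookup I'm describing, here is a rough sketch (column names taken from the descriptions above; the endswith check is just a stand-in for whatever the partial match ends up being, so treat it as a sketch rather than something I've verified):
import pandas as pd
from urllib.parse import urlparse

new_df = pd.read_csv('new.csv')            # column: 'urls'
scrapers_df = pd.read_csv('scrapers.csv')  # columns: 'scraper_dom', 'scraper_id'

# Isolate the netloc of every URL, e.g. 'https://www.example.com/x' -> 'www.example.com'
new_df['fnetloc'] = new_df['urls'].apply(lambda u: urlparse(u).netloc)

def lookup_scraper_id(netloc):
    # Suffix match as a placeholder for the partial match:
    # 'www.example.com' should match a scraper_dom of 'example.com'.
    for dom_, sid_ in zip(scrapers_df['scraper_dom'], scrapers_df['scraper_id']):
        if netloc.endswith(str(dom_)):
            return sid_
    return None  # no matching scraper

new_df['scraper_id'] = new_df['fnetloc'].apply(lookup_scraper_id)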
Current code; the commented-out bits at the bottom are failed attempts. I just thought I'd include what I've tried thus far.
(## are just intermediate print lines I put in to check output of the program)
import pandas as pd, re
from urllib.parse import urlparse
import csv
sd = {}
sid = {}
#INT = []
def fnetloc(any):
    try:
        p = urlparse(any)
        return p.netloc
    except IndexError:
        return 'Error'

def dom(any):
    try:
        r = any.split(',')
        return r[0]
    except IndexError:
        return 'Error'

def ids(any):
    try:
        e = any.split(',')
        return e[0]
    except IndexError:
        return 'Error'

with open('scrapers.csv',encoding='utf-8',newline='') as s:
    reader = enumerate(csv.reader(s))
    s.readline()
    for j, row in reader:
        dict1 = dict({'scraper_dom':dom(row[0]), 'scraper_id':ids(row[1])})
        sid[j + 1] = dict1

for di in sid.keys():
    id = di
    ##print(sid[di]['scraper_dom'],sid[di]['scraper_id'])

with open('new.csv',encoding='UTF-8',newline='') as f:
    reader = enumerate(csv.reader(f))
    f.readline()
    for i, row in reader:
        dict2 = dict({'scraper_domain': fnetloc(row[0])})
        sd[i + 1] = dict2

for d in sd.keys():
    id = d
    ##print(sd[d]['scraper_domain'])

#def tryme( ):
#    return filter(sd.has_key, sid)

#print(list(filter(sid, sd.keys())))
Sample of desired output.
You just need a procedure that can take a fnetloc and a list of scrapers and check to see if there is a scraper that matches that fnetloc:
def fnetloc_to_scraperid(fnetloc: str, scrapers: List[Scraper]) -> str:
    try:
        return next(x.scraper_id for x in scrapers if x.matches(fnetloc))
    except StopIteration:
        return "[no scraper id found]"
I also recommend that you use some classes instead of keeping everything in csv row objects--it reduces errors in your code, in the long run, and greatly advances your sanity.
This script worked on the sample data I fed it:
import csv
from urllib.parse import urlparse
from typing import List
def fnetloc(any) -> str:
    try:
        p = urlparse(any)
        return p.netloc
    except IndexError:
        return 'Error'

class Scraper:
    def __init__(self, scraper_dom: str, scraper_id: str):
        self.scraper_dom = scraper_dom
        self.scraper_id = scraper_id

    def matches(self, fnetloc: str) -> bool:
        return fnetloc.endswith(self.scraper_dom)

class Site:
    def __init__(self, url: str):
        self.url = url
        self.fnetloc = fnetloc(url)

    def get_scraperid(self, scrapers: List[Scraper]) -> str:
        try:
            return next(x.scraper_id for x in scrapers if x.matches(self.fnetloc))
        except StopIteration:
            return "[no scraper id found]"

sites = [Site(row[0]) for row in csv.reader(open("new.csv"))]
scrapers = [Scraper(row[0], row[1]) for row in csv.reader(open("scrapers.csv"))]

for site in sites:
    print(site.url, site.get_scraperid(scrapers), sep="\t")
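For illustration only (these rows are hypothetical, not taken from your actual files), the suffix match behaves like this:
# new.csv row:      https://www.example.com/some/page
# scrapers.csv row: example.com,scraper_01
fnetloc("https://www.example.com/some/page")   # -> "www.example.com"
"www.example.com".endswith("example.com")      # -> True, so scraper_01 is printed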
Related
I have created a simple API with FastAPI and I want to export the output in a text file (txt).
This is a simplified version of the code:
import sys
from clases.sequence import Sequence
from clases.read_file import Read_file
from fastapi import FastAPI
app = FastAPI()
@app.get("/DNA_toolkit")
def sum(input: str):  # pass the sequence in, this time as a query param
    DNA = Sequence(input)
    return {"Length": DNA.length(),  # return the response
            "Reverse": DNA.reverse(),
            "complement": DNA.complement(),
            "Reverse and complement": DNA.reverse_and_complement(),
            "gc_percentage": DNA.gc_percentage()
            }
And this is the output
{"Length":36,"Reverse":"TTTTTTTTTTGGGGGGGAAAAAAAAAAAAAAAATAT","complement":"ATATTTTTTTTTTTTTTTTCCCCCCCAAAAAAAAAA","Reverse and complement":"AAAAAAAAAACCCCCCCTTTTTTTTTTTTTTTTATA","gc_percentage":5.142857142857143}
The file I would like to get
Length 36
Reverse TTTTTTTTTTGGGGGGGAAAAAAAAAAAAAAAATAT
complement ATATTTTTTTTTTTTTTTTCCCCCCCAAAAAAAAAA
Reverse and complement AAAAAAAAAACCCCCCCTTTTTTTTTTTTTTTTATA
Is there a simple way to do this? This is my first time working with APIs and I don't even know how possible this is.
dict1={"Length":36,"Reverse":"TTTTTTTTTTGGGGGGGAAAAAAAAAAAAAAAATAT","complement":"ATATTTTTTTTTTTTTTTTCCCCCCCAAAAAAAAAA","Reverse and complement":"AAAAAAAAAACCCCCCCTTTTTTTTTTTTTTTTATA","gc_percentage":5.142857142857143}
with open("output.txt","w") as data:
    for k,v in dict1.items():
        append_data=k+" "+str(v)
        data.write(append_data)
        data.write("\n")
Output:
Length 36
Reverse TTTTTTTTTTGGGGGGGAAAAAAAAAAAAAAAATAT
complement ATATTTTTTTTTTTTTTTTCCCCCCCAAAAAAAAAA
Reverse and complement AAAAAAAAAACCCCCCCTTTTTTTTTTTTTTTTATA
gc_percentage 5.142857142857143
You can use the open function to create a new file and write your output there. And as @Blackgaurd told you, this isn't a code-writing service.
Also, I wrote this code really quickly, so there may be syntax errors.
import sys
import datetime
from clases.sequence import Sequence
from clases.read_file import Read_file
from fastapi import FastAPI
app = FastAPI()
@app.get("/DNA_toolkit")
def sum(input: str):  # pass the sequence in, this time as a query param
    DNA = Sequence(input)
    res = {"Length": DNA.length(),  # build the response
           "Reverse": DNA.reverse(),
           "complement": DNA.complement(),
           "Reverse and complement": DNA.reverse_and_complement(),
           "gc_percentage": DNA.gc_percentage()
           }

    # with open('result.txt', 'w+') as resFile:
    #     for i in res:
    #         resFile.write(i + " " + str(res[i]) + "\n")
    # Uncomment the block above if you don't want to save the result into a
    # file with a unique name; otherwise go with the method I wrote below...

    filename = str(datetime.datetime.now().date()) + '_' + str(datetime.datetime.now().time()).replace(':', '.')
    with open(filename + '.txt', 'w+') as resFile:
        for i in res:
            resFile.write(i + " " + str(res[i]) + "\n")

    return res  # return the response
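As a side note, the same timestamped filename can be built a bit more compactly with strftime (a sketch of an equivalent alternative, not something the code above requires):
import datetime

# e.g. "2024-01-31_14.05.59.txt" -- same idea, one call instead of string surgery
filename = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S") + ".txt"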
I'm going to assume that you have already got your data somehow by calling your API.
# data = requests.get(...).json()

# save to file:
with open("DNA_insights.txt", 'w') as f:
    for k, v in data.items():
        f.write(f"{k}: {v}\n")
I used code from a YouTube video:
import json
import re
import requests
class Helper:
    def __init__(self):
        pass

    def id_from_url(self, url: str):
        return url.rsplit("/", 1)[1]

class YouTubeStats:
    def __init__(self, url: str):
        #self.json_url = urllib.request.urlopen(url)
        self.json_url = requests.get(url)
        self.data = json.loads(self.json_url.text)

    def print_data(self):
        print(self.data)

    def get_video_title(self):
        return self.data["items"][0]["snippet"]["title"]

    def get_video_description(self):
        return self.data["items"][0]["snippet"]["description"]

api_key = "never-gonna-let-you-know"
link_file = "links.csv"

with open(link_file, "r") as f:
    content = f.readlines()

content = list(map(lambda s: s.strip(), content))
content = list(map(lambda s: s.strip(','), content))

helper = Helper()

for youtube_url in content:
    video_id = helper.id_from_url(youtube_url)
    url = f"https://www.googleapis.com/youtube/v3/search?part=snippet&channelId={video_id}&maxResults=1&order=date&type=video&key={api_key}"
    yt_stats = YouTubeStats(url)
    title = yt_stats.get_video_title()
    description = yt_stats.get_video_description()
    print(title)
and whenever I use it, it keeps showing HTML-escaped characters (&#39;) instead of an apostrophe.
Note: this might get updated and fix itself since it's an API, but please use the above as a reference; also, my API key might break.
Well, never mind, I figured it out: I just needed to use html.unescape() to convert the escaped characters.
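For anyone who hits the same thing, a minimal sketch of that fix (the sample string is made up):
import html

title = "It&#39;s a video title"
print(html.unescape(title))  # -> It's a video title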
I am trying to generate some links.
NOTE: THERE IS A PROBLEM WITH return vs print.
When I write the code with return, it only returns one link.
Run this code:
import requests
import re
wikiurl = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
state_pat = re.compile(r'title=\"(\w+)\">')
def get_page_content(url):
    response = requests.get(url)
    return response.text

def link_generator(wikiurl):
    content = get_page_content(wikiurl)
    names = state_pat.findall(content)
    for i in names:
        return 'https://www.local.com/business/results/listing.cfm?s=tile+and+grout+cleaning&ar=' + i + '%2CNY&gsp=ZFZWU1RaU09zWGNYdjFEV1l2ZHFLNVZUUFRPT3c3a21lbFVCbERQOU5VS3p6ai9DRXNMa29PcVZ0ZVV0TXZLM01wUVFUUHZYK2lrMnB5VGJyMHZJeUNoK1dXaUoxZ1NKT3AxbVlJOGN1aVBEb1NRMzlCemdDVHh5aGd3eU5DYUpKWDRtNFVQR0llOFJibUhQR3pSV3ppWFR4ekJoRVltL29UdFQ0MW9KUS9IenJrcjVBMUt3bkErRnlSVnFjRnZ0TjhRWEdET0FuZWRVUGNkemdxUlkzOUYyUjZXbHBzQWRMY3hEUTY4WmtnYkRsSkEvazBrVVY5d0NmSVVMaWp0WnNDNmFsZFNzMitWeHZDYTg2YmJwRGQzSisvOUJaYWNBaFdUd21LaWJpNk9veS9OT1N1VE5DV3RUNDIxdkY5NmZ4bWFVcWtLc1BlVkNRNlEvSG4ydER1T1ZkcXk4Um5BWU5kUU9UZnVOUE9BPQ%253D%253D&lwfilter=&wsrt=&wpn='
a = link_generator(wikiurl)
print(a)
And if I run this code with a print inside the function, it prints all the links. Why? I need all the links with return.
Run this code and you will see the difference:
import requests
import re
wikiurl = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
state_pat = re.compile(r'title=\"(\w+)\">')
def get_page_content(url):
    response = requests.get(url)
    return response.text

def link_generator(wikiurl):
    content = get_page_content(wikiurl)
    names = state_pat.findall(content)
    for i in names:
        print('https://www.local.com/business/results/listing.cfm?s=tile+and+grout+cleaning&ar=' + i + '%2CNY&gsp=ZFZWU1RaU09zWGNYdjFEV1l2ZHFLNVZUUFRPT3c3a21lbFVCbERQOU5VS3p6ai9DRXNMa29PcVZ0ZVV0TXZLM01wUVFUUHZYK2lrMnB5VGJyMHZJeUNoK1dXaUoxZ1NKT3AxbVlJOGN1aVBEb1NRMzlCemdDVHh5aGd3eU5DYUpKWDRtNFVQR0llOFJibUhQR3pSV3ppWFR4ekJoRVltL29UdFQ0MW9KUS9IenJrcjVBMUt3bkErRnlSVnFjRnZ0TjhRWEdET0FuZWRVUGNkemdxUlkzOUYyUjZXbHBzQWRMY3hEUTY4WmtnYkRsSkEvazBrVVY5d0NmSVVMaWp0WnNDNmFsZFNzMitWeHZDYTg2YmJwRGQzSisvOUJaYWNBaFdUd21LaWJpNk9veS9OT1N1VE5DV3RUNDIxdkY5NmZ4bWFVcWtLc1BlVkNRNlEvSG4ydER1T1ZkcXk4Um5BWU5kUU9UZnVOUE9BPQ%253D%253D&lwfilter=&wsrt=&wpn=')
a = link_generator(wikiurl)
print(a)
When you issue a return statement in a function, no further lines are executed and control goes back to the caller, so the first iteration of the loop ends the whole function. If you want to return items one by one, you can turn the function into a generator by replacing return with yield. Alternatively, collect the results in a list and return the list.
You then need to change your final line when you're calling this to:
a = list(link_generator(wikiurl))
to unpack your generator
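A minimal, self-contained illustration of the return vs yield difference (toy data only, no web request):
def with_return(items):
    for i in items:
        return i          # leaves the function on the very first item

def with_yield(items):
    for i in items:
        yield i           # hands back every item, one at a time

print(with_return(["a", "b", "c"]))        # -> a
print(list(with_yield(["a", "b", "c"])))   # -> ['a', 'b', 'c']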
I have the following script that I wrote in order to parse some logs. The script works fine, but now I want to add something that will count the duplicates and add that count to the list. I have been trying to add that little piece for the last few days and I can't figure out how to do it. Any help would be very much appreciated. I am trying to add it in the function called strip_dupes, or maybe I am looking at it wrong and need to create a new function altogether to do this.
import sys
import collections
def strip_dupes(lst):
    seen = set()
    keep = []
    for sd in lst:
        if sd in seen:
            print("Duplicate Found: %s" % (sd,))
        else:
            seen.add(sd)
            keep.append(sd)
    final_list = keep
    return final_list

def format_traffic(ltpl, fname):
    ofile = open(output_file, "w")
    for s, d, t in ltpl:
        ofile.write("Source %s -> Destination: %s -> Service: %s\n" % (s, d, t))
    ofile.close()

log_file = open(sys.argv[1], "r")
lines = log_file.readlines()
output_file = sys.argv[2]

traffic_list = []
for l in lines:
    words = l.split()
    source = words[9].strip('"')
    dest = words[10].strip('"')
    serv = words[7].strip('"')
    flow = (source, dest, serv)
    traffic_list.append(flow)

final = strip_dupes(traffic_list)
format_traffic(final, output_file)
log_file.close()
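One possible way to count the duplicates rather than just flag them (a sketch only; it reuses the collections module the script already imports and counts occurrences of each flow tuple):
import collections  # already imported at the top of the script above

def count_dupes(lst):
    # Count how many times each flow tuple appears in the list.
    counts = collections.Counter(lst)
    # Report anything seen more than once.
    for flow, n in counts.items():
        if n > 1:
            print("Duplicate Found: %s (seen %d times)" % (flow, n))
    return counts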
I keep getting an error, but I don't see it.
I am new to programming, so if you explain the code to me, please don't assume I know too much.
#!/usr/bin/env python
# Name:
# Student number:
'''
This script crawls the IMDB top 250 movies.
'''
# Python standard library imports
import os
import sys
import csv
import codecs
import cStringIO
import errno
# Third party library imports:
import pattern
from pattern.web import URL, DOM
# --------------------------------------------------------------------------
# Constants:
TOP_250_URL = 'http://www.imdb.com/chart/top'
OUTPUT_CSV = 'top250movies.csv'
SCRIPT_DIR = os.path.split(os.path.realpath(__file__))[0]
BACKUP_DIR = os.path.join(SCRIPT_DIR, 'HTML_BACKUPS')
# --------------------------------------------------------------------------
# Unicode reading/writing functionality for the Python CSV module, taken
# from the Python.org csv module documentation (very slightly adapted).
# Source: http://docs.python.org/2/library/csv.html (retrieved 2014-03-09).
class UTF8Recoder(object):
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8
    """
    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        return self.reader.next().encode("utf-8")
class UnicodeReader(object):
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self
class UnicodeWriter(object):
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
# --------------------------------------------------------------------------
# Utility functions (no need to edit):
def create_dir(directory):
    '''
    Create directory if needed.
    Args:
        directory: string, path of directory to be made
    Note: the backup directory is used to save the HTML of the pages you
    crawl.
    '''
    try:
        os.makedirs(directory)
    except OSError as e:
        if e.errno == errno.EEXIST:
            # Backup directory already exists, no problem for this script,
            # just ignore the exception and carry on.
            pass
        else:
            # All errors other than an already existing backup directory
            # are not handled, so the exception is re-raised and the
            # script will crash here.
            raise
def save_csv(filename, rows):
    '''
    Save CSV file with the top 250 most popular movies on IMDB.
    Args:
        filename: string filename for the CSV file
        rows: list of rows to be saved (250 movies in this exercise)
    '''
    with open(filename, 'wb') as f:
        writer = UnicodeWriter(f)  # implicitly UTF-8
        writer.writerow([
            'title', 'runtime', 'genre(s)', 'director(s)', 'writer(s)',
            'actor(s)', 'rating(s)', 'number of rating(s)'
        ])
        writer.writerows(rows)
def make_backup(filename, html):
    '''
    Save HTML to file.
    Args:
        filename: absolute path of file to save
        html: (unicode) string of the html file
    '''
    with open(filename, 'wb') as f:
        f.write(html)
def main():
    '''
    Crawl the IMDB top 250 movies, save CSV with their information.
    Note:
        This function also makes backups of the HTML files in a sub-directory
        called HTML_BACKUPS (those will be used in grading).
    '''
    # Create a directory to store copies of all the relevant HTML files (those
    # will be used in testing).
    print 'Setting up backup dir if needed ...'
    create_dir(BACKUP_DIR)

    # Make backup of the IMDB top 250 movies page
    print 'Access top 250 page, making backup ...'
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    make_backup(os.path.join(BACKUP_DIR, 'index.html'), top_250_html)

    # extract the top 250 movies
    print 'Scraping top 250 page ...'
    url_strings = scrape_top_250(top_250_url)

    # grab all relevant information from the 250 movie web pages
    rows = []
    for i, url in enumerate(url_strings):  # Enumerate, a great Python trick!
        print 'Scraping movie %d ...' % i
        # Grab web page
        movie_html = URL(url).download(cached=True)
        # Extract relevant information for each movie
        movie_dom = DOM(movie_html)
        rows.append(scrape_movie_page(movie_dom))
        # Save one of the IMDB's movie pages (for testing)
        if i == 83:
            html_file = os.path.join(BACKUP_DIR, 'movie-%03d.html' % i)
            make_backup(html_file, movie_html)

    # Save a CSV file with the relevant information for the top 250 movies.
    print 'Saving CSV ...'
    save_csv(os.path.join(SCRIPT_DIR, 'top250movies.csv'), rows)
The function below should return the webpage links of the top 250 movies:
# --------------------------------------------------------------------------
# Functions to adapt or provide implementations for:
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.
    Args:
        url: pattern.web.URL instance pointing to the top 250 index page
    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    table_rows = dom.by_id('main').by_tag('table')[1].by_tag('tr')
    for tr in table_rows[1:]:
        a = tr.by_tag('a')[0]
        movie_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string)))

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
#print scrape_top_250(url)
And finally, this function should return specific contents.
def scrape_movie_page(dom):
    '''
    Scrape the IMDB page for a single movie
    Args:
        dom: pattern.web.DOM instance representing the page of 1 single
            movie.
    Returns:
        A list of strings representing the following (in order): title, year,
        duration, genre(s) (semicolon separated if several), director(s)
        (semicolon separated if several), writer(s) (semicolon separated if
        several), actor(s) (semicolon separated if several), rating, number
        of ratings.
    '''

# YOUR SCRAPING CODE GOES HERE:
for p in movie_urls:
    p_url = URL(p)
    p_dom = DOM(p_url.download(cached=True))
    title = clean_unicode(p_dom.by_class('header')[0].content)
    title = plaintext(strip_between('<span', '</span>', title))
    runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)
    duration = runtime
    genres = []
    for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
        genres.append(clean_unicode(genre.content))
    directors = []
    writers = []
    actors = []
    text_blocks = p_dom.by_class('txt-block')[:3]
    for t in text_blocks:
        spans = t.by_tag('span')
        for s in spans:
            if s.attributes.get('itemprop') == 'director':
                director = s.by_tag('span')[0].by_tag('a')[0].content
                directors.append(clean_unicode(director))
            if s.attributes.get('itemprop') == 'writer':
                p_writer = s.by_tag('span')[0].by_tag('a')[0].content
                writers.append(clean_unicode(p_writer))
            if s.attributes.get('itemprop') == 'actors':
                actor = s.by_tag('span')[0].by_tag('a')[0].content
                actors.append(clean_unicode(actor))
    rating = []
    ratings_count = []
    spans = p_dom.by_class('star-box-details')[0].by_tag('span')
    for s in spans:
        if s.attributes.get('itemprop') == 'ratingValue':
            rating = clean_unicode(s.content)
        if s.attributes.get('itemprop') == 'ratingCount':
            ratings_count = clean_unicode(s.content)
    # format the strings from lists
    genres = concat_strings(genres)
    directors = concat_strings(directors)
    writers = concat_strings(writers)
    actors = concat_strings(actors)
    # Return everything of interest for this movie (all strings as specified
    # in the docstring of this function).
    return title, duration, genres, directors, writers, actors, rating, \
        n_ratings
if __name__ == '__main__':
    main()  # call into the program

    # If you want to test the functions you wrote, you can do that here:
    # ...
It's just that (in the original revision) you forgot to indent the body of the function scrape_movie_page. The for loop is in module scope.
The most common cause of this error is not properly indenting the body of the function. Sometimes the code looks properly indented yet still throws the same error; in my experience that comes from mixed indentation. If, within the same block, some lines are indented with tabs and others with spaces, the code can look fine from an indentation point of view, but Python will still raise an indentation error.
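If you suspect mixed tabs and spaces, the standard library's tabnanny module can flag them (a quick sketch; 'your_script.py' is a placeholder filename):
import tabnanny

# Prints a diagnostic for every line whose tab/space indentation is ambiguous.
tabnanny.check("your_script.py")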