I used Wget to download webpages.
My question is: is it possible to get the URL from the local HTML files?
I used Python to analyse the HTML file content, and I want to print each file's URL.
I am trying to add more features to this program, so if I can print the URL next to each result, the user can simply click the link to open the webpage.
Here is my code:
def search(self):
    keyword = self.entry.get()
    mypath = "/Users/Tsu-AngChou/MasterProject/Practice/try_test/"
    files = listdir(mypath)
    translator = str.maketrans("", "", string.punctuation)
    count1 = 0
    test_list = []
    test_list2 = []
    for f in files:
        fullpath = join(mypath, f)
        if f == '.DS_Store':
            os.remove(f)
        elif isfile(fullpath):
            # print(f)
            for html_cont in range(1):
                response = open(f, 'r', encoding='utf-8')
                html_cont = response.read()
                soup = bs(html_cont, 'html.parser')
                regular_string = soup.get_text()
                new_string = regular_string.translate(translator).split()
                new_list = [item[:14] for item in new_string]
                a = dict.fromkeys(new_list, f)
                wordfreq = []
                c = new_list
                for w in c:
                    wordfreq.append(c.count(w))
                fre = dict(zip(c, wordfreq))
                sentence = new_list
                keyword1 = keyword
                words = sentence
                if keyword in fre:
                    test_list.append(a[keyword])
                    test_list2.append(fre[keyword])
                    count1 = count1 + 1
                    for (i, subword) in enumerate(words):
                        if (subword == keyword1):
                            test_list3 = i + 1
    for i in range(0, count1 - 1):
        for j in range(0, count1 - 1 - i):
            if (test_list2[j] < test_list2[j + 1]):
                temp = test_list[j]
                temp2 = test_list2[j]
                test_list[j] = test_list[j + 1]
                test_list2[j] = test_list2[j + 1]
                test_list[j + 1] = temp
                test_list2[j + 1] = temp2
    for i in range(0, count1):
        print(keyword, "Filename:", test_list[i], "Frequency:", test_list2[i])
    return a
This is my output, and I want to have the link follow every result.
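One possible way to do this (a sketch, not part of the original code, and it assumes the saved pages still contain a canonical link or an og:url meta tag, which many sites include but not all): read the tag with BeautifulSoup and print it next to the filename and frequency. The helper name get_source_url is made up for illustration.

from bs4 import BeautifulSoup

def get_source_url(html_text):
    # Best-effort guess at the original URL of a saved page.
    # Returns None when neither tag is present.
    soup = BeautifulSoup(html_text, 'html.parser')
    canonical = soup.find('link', rel='canonical')
    if canonical and canonical.get('href'):
        return canonical['href']
    og_url = soup.find('meta', property='og:url')
    if og_url and og_url.get('content'):
        return og_url['content']
    return None

Inside the loop you could call get_source_url(html_cont) and print the result alongside the filename and frequency. If the pages carry neither tag, the reliable fallback is to record a filename-to-URL mapping at download time, for example from the Wget log.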
Related
So I am using append to extend my list of scraped apartments. In this code I run into a problem because I have created a second for loop to change the page on the website: the first for loop gives a new page to the next for loop to scrape. But when it is done with one page, it just overwrites the last list. What am I doing wrong?
for page in range(1, 4):  # Gives new page to scrape
    r = requests.get(url + str(page))
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    base_path = "https://www.etuovi.com"
    x = []
    apartment_list = []
    for index, apartment in enumerate(apartments):
        if index == 2:  # Just to not scrape every item
            break
        relative_path = apartment.a['href']
        full_path = base_path + relative_path
        id_number = apartment.a['id']
        apartment_list.append(get_apartment_data(full_path))  # This works for one page
    x.append(apartment_list)  # Tried to make this work.. Think one list should be enough.
And the functions:
def get_content_value(info_list_data):
    if info_list_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", "").replace("€", "").replace("/ kk", "").replace("\n", "")
                for li in info_list_data.find_all("li")]
    else:
        return info_list_data.get_text(" ", strip=True).replace("\xa0", "").replace("€", "").replace("/ kk", "").replace("\n", "")
Last:
def get_apartment_data(url):
    r = requests.get(url)
    soup = bs(r.content)
    all_info_list = soup.find_all(class_="CompactInfoRow__infoRow__2hjs_ flexboxgrid__row__wfmuy")
    for info_list in all_info_list:
        info_list.prettify()
    info = {}
    for index, info_list in enumerate(all_info_list):
        content_key = info_list.find(class_="flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-4__3RH7g ItemHeader__itemHeader__32xAv").get_text(" ", strip=True)
        content_value = get_content_value(info_list.find(class_="flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-8__2jfMv CompactInfoRow__content__3jGt4"))
        info[content_key] = content_value
    return info
for page in range(1, 4):  # Gives new page to scrape
    r = requests.get(url + str(page))
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    base_path = "https://www.etuovi.com"
    x = []
    apartment_list = []
    for index, apartment in enumerate(apartments):
        if index == 2:  # Just to not scrape every item
            break
        relative_path = apartment.a['href']
        full_path = base_path + relative_path
        id_number = apartment.a['id']
        apartment_list.append(get_apartment_data(full_path))  # This works for one page
    x.append(apartment_list.copy())
You need to use the copy() method to make an independent copy. Otherwise, every time you change apartment_list it also changes inside your x list, because both names refer to the same list object.
More generally:
x = []
lst = [1,2,3]
x.append(lst)
print (x)
lst[0] = 0
x.append(lst)
print (x)
Output:
[[1,2,3]]
[[0,2,3],[0,2,3]]
The right way is:
x = []
lst = [1,2,3]
x.append(lst.copy())
print (x)
lst[0] = 0
x.append(lst.copy())
print (x)
Output:
[[1,2,3]]
[[1,2,3],[0,2,3]]
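One caveat worth adding, not part of the original answer: list.copy() is a shallow copy. In the apartment code each element of apartment_list is a dict returned by get_apartment_data, so if those dicts were later mutated in place, the copies stored in x would still reflect the change; copy.deepcopy() covers that case. A small sketch with a made-up 'price' key:

import copy

x = []
lst = [{'price': 100}]
x.append(lst.copy())          # shallow copy: the inner dict is still shared
lst[0]['price'] = 0
print(x)                      # [[{'price': 0}]]

x = []
lst = [{'price': 100}]
x.append(copy.deepcopy(lst))  # deep copy: the inner dict is copied too
lst[0]['price'] = 0
print(x)                      # [[{'price': 100}]]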
I am trying to get arrays from a string. I want the texts as the output, and the texts should be in array format.
I got this error at the line for line in txt.readlines():
AttributeError: 'unicode' object has no attribute 'readline'
def de_gutenberger(filename):
    with open(filename, 'rb') as f:
        txt = f.read()
    author, title = get_author_and_title(txt)
    # get rid of header & footer
    start_txt = "START OF THIS PROJECT GUTENBERG EBOOK"
    end_txt = "END OF THIS PROJECT GUTENBERG EBOOK"
    start = re.search('(' + start_txt + ').*?\n', txt)
    start_ind = start.end() if start != None else 0
    end = re.search('(' + end_txt + ').*?\n', txt)
    end_ind = end.start() if end != None else len(txt)
    word_string = stem_and_stop(nltk.word_tokenize(txt[start_ind:end_ind]))
    return author, title, word_string
def get_books(folder, files):
    authors = []
    texts = []
    titles = []
    for i, f in enumerate(files):
        print("Number {} of {}".format(i + 1, len(files)))
        author, title, txt = de_gutenberger(folder + f)
        for line in txt.readlines():
            y = [value for value in line.split()]
            texts.append(y)
        authors.append(author)
        titles.append(title)
What does de_gutenberger do? It is returning a tuple, but the 3rd object is NOT a file object, but rather a unicode string.
Try this:
def get_books(folder, files):
    authors = []
    texts = []
    titles = []
    for i, f in enumerate(files):
        print("Number {} of {}".format(i + 1, len(files)))
        author, title, txt = de_gutenberger(folder + f)
        y = [value for value in txt.split()]
        texts.append(y)
        authors.append(author)
        titles.append(title)
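If the per-line structure matters (one list per line instead of one list per book), a small variation on the same idea, still assuming txt is a plain string returned by de_gutenberger, is to split on line breaks first:

def get_books(folder, files):
    authors = []
    texts = []
    titles = []
    for i, f in enumerate(files):
        author, title, txt = de_gutenberger(folder + f)
        # splitlines() works on a string, unlike readlines(), which needs a file object
        for line in txt.splitlines():
            texts.append(line.split())
        authors.append(author)
        titles.append(title)
    return authors, titles, texts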
How do I call a function inside a for loop in Python?
I must call this function:
def EE():
    print("dd")
inside this:
def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        files = request.FILES.getlist('file_field')
        fs = FileSystemStorage()
        for f in files:
            filename = fs.save(f.name, f)
            ee = EE()
            print(ee)
        number_of_files = len(files)
        uploaded_file_url = fs.url(filename)
    return render(request, 'core/simple_upload.html', {
        # 'uploaded_file_url': uploaded_file_url
    })
The way you have written it is correct. But since your function doesn't return any value, EE() evaluates to None, so print(ee) will not show the output you expect.
Assuming the function to be called and the caller are in the same scope:
def sample_function():
    return "This is a sample function."

def main_function():
    # function call
    x = sample_function()
    print(x)
    # add your logic here.
Hope this will help.
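Applied to the view above, a minimal sketch (assuming you want one call per uploaded file, and leaving the rest of the Django view as it was) would have EE return its message instead of only printing it:

def EE():
    # return the message so the caller can use it
    return "dd"

def upload_file(request):
    if request.method == 'POST':
        files = request.FILES.getlist('file_field')
        fs = FileSystemStorage()
        for f in files:
            filename = fs.save(f.name, f)
            ee = EE()   # function call inside the for loop, once per uploaded file
            print(ee)   # now prints "dd" rather than None
    return render(request, 'core/simple_upload.html', {})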
def sentence_finder(text, word):
    sentences = sent_tokenize(text)
    return [sent for sent in sentences if word in word_tokenize(sent)]

def EE(filename, no_of_files):
    for i in range(no_of_files):
        try:
            print('\n')
            print(i + 1)
            pdfFileObj = open(filename, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num_pages = pdfReader.numPages
            count = 0
            text = ""
            # The while loop will read each page
            while count < num_pages:
                pageObj = pdfReader.getPage(count)
                count += 1
                text += pageObj.extractText()
            # This if statement checks whether the above library returned words.
            # It's done because PyPDF2 cannot read scanned files.
            if text != "":
                text = text
            # If the above returned nothing, we run the OCR library textract to
            # convert scanned/image-based PDF files into text
            else:
                text = textract.process(filename, method='tesseract', language='eng')
            # select relevant section
            # education qualification
            textt = re.search(r'EDUCATION\n.*?SKILLS', text, re.DOTALL).group()
            edu_qulification = textt[textt.find('\n') + 1:textt.rfind('\n')]
            srt1 = edu_qulification.lower()
            # print(edu_qulification)
            str12 = srt1.replace("\n", ". ")
            str2 = str12.replace("m.s.", "master")
            # print(str2)
            syn = synonyms = wordnet.synsets('degree')
            syn_set1 = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            syn = synonyms = wordnet.synsets('BACHELOR')
            syn_set2 = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            syn = synonyms = wordnet.synsets('Master')
            syn_set3 = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            listone = ['bsc', 'be', 'btech']
            listtwo = ['m.s.']
            mergedlist = listone + syn_set1 + syn_set2 + syn_set3 + listtwo
            # print(mergedlist)
            for i in mergedlist:
                sent_part = sentence_finder(str2, i)
                # print(sent_part)
                if not sent_part:
                    pass
                else:
                    Digree = sentence_finder(str2, i)
            synn = synonyms = wordnet.synsets('university')
            syn_seta = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            synn = synonyms = wordnet.synsets('institute')
            syn_setb = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            synn = synonyms = wordnet.synsets('college')
            syn_setc = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            listthree = ['center']
            mergedlistt = listthree + syn_seta + syn_setb + syn_setc
            # print(mergedlistt)
            for j in mergedlistt:
                sent_partt = sentence_finder(str2, j)
                # print(sent_partt)
                if not sent_partt:
                    pass
                else:
                    University = sentence_finder(str2, j)
            # Digree = sentence_finder(str2, 'BACHELOR')
            # University = sentence_finder(str2, 'UNIVERSITY')
            print(Digree)
            print(University)
            print(".................................................................")
            # print(University)
        except:
            print("No Education Qualification mentioned")
I've basically created a spider that follows a set of links acquired from an API, and then extracts text from the HTML body. I'm trying to append returned items to appropriate lists, which are then added to a dictionary. When I run the code, the resultant JSON file only successfully writes the first line.
I am running Python 3.6 in a virtual environment on a Windows 10 64-bit machine, and I run pip-upgrade daily.
from nltk.corpus import stopwords
import smtplib
from time import sleep # To prevent overwhelming the server between connections
from bs4 import BeautifulSoup as soup
import scrapy
import mysql.connector as mariadb
import sys
from collections import Counter
from pprint import pprint
import json
import re
conn = mariadb.connect(user=dbuser, password=dbpassword, database=dbdatabase)
c = conn.cursor()
e = sys.exc_info()[0]
c.execute("Select URL FROM [TABLE]")
JobURLs = c.fetchall()
for object in JobURLs:
    urls = []
    url_string = str(object)
    rx = re.compile('\W\W\W$')
    res = rx.sub('', url_string)
    rx = re.compile('^\W\W')
    url = rx.sub('', res)
    urls.append(url)

c.execute("Select JvId FROM [TABLE]")
JobIDs = c.fetchall()

for object in JobIDs:
    item = {}
    item['JvId'] = []
    JobID_string = str(object)
    rx = re.compile('\W\W\W$')
    res = rx.sub('', JobID_string)
    rx = re.compile('^\W\W')
    JobID = rx.sub('', res)
    item['JvId'].append(JobID)
class JobListing(scrapy.Spider):
    name = 'JobListingCrawler'
    start_urls = urls

    def parse(self, response):
        # pass
        item['urlText'] = response.url
        page_html = response.body
        page_soup = soup(page_html, 'lxml')
        for script in page_soup(['script', 'style']):
            script.extract()
        item['jobDescText'] = page_soup.get_text('''\n''', strip=True)

        ## TextCleaner Function for Word Counter
        text = item['jobDescText'].replace('\n', ' ')
        lines = [line.strip() for line in text.splitlines()]
        chunks = [phrase.strip() for line in lines for phrase in line.split(' ')]

        def chunk_space(chunk):
            chunk_out = chunk + ' '
            return chunk_out

        text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8')
        try:
            text = text.decode('unicode_escape').encode('ascii', 'ignore')
        except:
            print(e)
            pass
        text = re.sub('[^a-zA-Z,+3]', ' ', str(text))
        text = text.lower().split()
        stop_words = set(stopwords.words('english'))
        text = [word for word in text if not word in stop_words]
        wordCounter = Counter(text)
        item['wordCounter'] = str(wordCounter)

        ## And now we parse for email addresses!
        prog = re.compile(r"[A-z0-9._%+-]+@[A-z0-9.-]+\.[A-z]{2,}")
        found = prog.search(item['jobDescText'].replace('\n', ' '))
        try:
            item['email'] = str(found.group(0))
        except:
            item['email'] = 'null'
            pass

        filename = 'results.jl'
        line = json.dumps(dict(item)) + '\n'
        with open(filename, 'a') as f:
            f.write(line)
        self.log('Saved Line to %s' % filename)
You just need to declare a Scrapy Item, which contains the definitions of your returned fields.
After that, configure your settings file to enable Scrapy Feed Exports, which use the built-in JsonItemExporter to write your extracted data:
FEED_URI: file:///tmp/export.json
FEED_FORMAT: json
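A minimal sketch of what that could look like (the item class name JobListingItem and its fields are illustrative, not part of the original code): declare the fields once, populate the item inside parse(), and yield it so the feed exporter writes it out instead of opening results.jl by hand.

import scrapy

class JobListingItem(scrapy.Item):
    # field names mirror the keys already used in parse()
    urlText = scrapy.Field()
    jobDescText = scrapy.Field()
    wordCounter = scrapy.Field()
    email = scrapy.Field()

class JobListing(scrapy.Spider):
    name = 'JobListingCrawler'
    start_urls = urls

    def parse(self, response):
        item = JobListingItem()
        item['urlText'] = response.url
        # ... text cleaning, word counting and email extraction as in the question ...
        yield item  # each yielded item is written by the feed exporter

In settings.py the two keys above are written as assignments (newer Scrapy versions replace them with the FEEDS setting, but these match the answer):

FEED_URI = 'file:///tmp/export.json'
FEED_FORMAT = 'json'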
So silly me: I put the list variable within the For Loop, so each time the actions looped it would delete the previously written values. Moving them outside of the loop solved the problem.
c.execute("Select URL FROM CareerOneStopJobs")
JobURLs = c.fetchall()
urls = []
for element in JobURLs:
url_string = str(element)
rx = re.compile('\W\W\W$')
res = rx.sub('', url_string)
rx = re.compile('^\W\W')
url = rx.sub('', res)
urls.append(url)
c.execute("Select JvId FROM CareerOneStopJobs")
JobIDs = c.fetchall()
item = {}
for JobID in JobIDs:
item['JvId'] = []
JobID_string = str(JobID)
rx = re.compile('\W\W\W$')
res = rx.sub('', JobID_string)
rx = re.compile('^\W\W')
JobID = rx.sub('', res)
item['JvId'] = JobID
I am working on a text search project. I have 2 lists.
a = ['ibm','dell']
b =['strength','keyword']##this is a list of keywords given by the user
Now I create combinations for searching Google.
lst = list(itertools.product(a, b))
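For these two inputs, lst works out to the four (company, keyword) pairs:

import itertools

a = ['ibm', 'dell']
b = ['strength', 'keyword']
lst = list(itertools.product(a, b))
print(lst)
# [('ibm', 'strength'), ('ibm', 'keyword'), ('dell', 'strength'), ('dell', 'keyword')]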
What I need help with is below:
Using the code I will search for the text using different keywords and their lemmas. After that I need to write the searched text to an Excel file: I need to create worksheets named after the entries in list a and write only the searched text into the different worksheets. I am not able to figure this out. Below is part of my code.
def getarticle(url, n):
    final = []
    regex = '(.*).pdf'
    pattern = re.compile(regex)
    if re.match(pattern, url) is not None:
        text = pdf_to_text(url)
        final.append('')
        final.append(url)
        final.append(text)
        New_file = open((('text' + str((round(random.random(), 2))) + '.txt')), 'w+')
        New_file.write(smart_str(unicode(text, 'utf-8')))
        New_file.close()
    else:
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.addheaders = [('User-agent', 'Chrome')]
        html = br.open(url).read()
        titles = br.title()
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        soup = bs4.BeautifulSoup(readable_article)
        Final_Article = soup.text
        final.append(titles)
        final.append(url)
        final.append(Final_Article)
        raw = nltk.clean_html(html)
        cleaned = re.sub(r'& ?(ld|rd)quo ?[;\]]', '\"', raw)
        tokens = nltk.wordpunct_tokenize(raw)
        lmtzr = WordNetLemmatizer()
        t = [lmtzr.lemmatize(t) for t in tokens]
        text = nltk.Text(t)
        word = words(n)
        find = ' '.join(str(e) for e in word)
        search_words = set(find.split(' '))
        sents = ' '.join([s.lower() for s in text])
        blob = TextBlob(sents.decode('ascii', 'ignore'))
        matches = [map(str, blob.sentences[i-1:i+2])  # from prev to after next
                   for i, s in enumerate(blob.sentences)  # i is index, e is element
                   if search_words & set(s.words)]
        return ''.join(str(y).replace('& rdquo', '').replace('& rsquo', '') for y in matches)
This returns the text; now I need to write it to Excel files, which I am unable to code.
As far as writing text out to a file Excel can read is concerned, you might want to look at Python's csv library, which provides lots of useful .csv manipulation tools.
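A minimal sketch of that suggestion, assuming one CSV file per name in list a is acceptable (Excel opens these directly; producing multiple worksheets inside a single .xlsx workbook would need a library such as openpyxl instead). The searched_text dict is a stand-in for whatever getarticle returns per company, and Python 3 is shown:

import csv

a = ['ibm', 'dell']
# stand-in for the text returned by getarticle() for each name in a
searched_text = {'ibm': 'matched sentences for ibm ...',
                 'dell': 'matched sentences for dell ...'}

for name in a:
    # one .csv file per name; Excel can open each of these directly
    with open(name + '.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['name', 'text'])
        writer.writerow([name, searched_text[name]])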