Parsing GeoRSS feed with python scripting - python

I have an exam tomorrow and need to get my program.py file working. I have to parse a GeoRSS feed (https://www.tvfoodmaps.com/MVFN.xml, to be specific) and extract the "Latitude", "Longitude", "Title" and "Description" values into separate lists. After creating these lists, I need to create a feature class that holds these points and their data in ArcMap. The script is to be run in ArcMap to map out the restaurant locations together with this information.
Right now I'm stuck on getting all the data into tables. The problem is at the get title and description parts because it seems that if I can get the Titles, then it doesn't run Descriptions and vice versa. Any help would be really appreciated! Here's what I have so far;
import os
import urllib

# store the pathname to where you want to add text file to
# path = arcpy.GetParameterAsText(0)  # pathname to folder
# FullFCOutputPath = arcpy.GetParameterAsText(1)
path = "https://www.tvfoodmaps.com/MVFN.xml"


def _tag_text(chunk, tag):
    """Return the text between <tag> and </tag> in chunk, or '' if absent."""
    open_tag = '<' + tag + '>'
    close_tag = '</' + tag + '>'
    start = chunk.find(open_tag)
    if start == -1:
        return ''
    start += len(open_tag)
    end = chunk.find(close_tag, start)
    if end == -1:
        return ''
    # NOTE(review): the entity literal was rendered as a bare apostrophe in
    # the original listing; '&#39;' is presumably what was meant -- confirm
    # against the feed.
    return chunk[start:end].replace('&#39;', "'")


# urllib.urlopen is the Python 2 spelling; close the response explicitly.
f = urllib.urlopen(path)
try:
    myfile = f.read()
finally:
    f.close()

lstFieldNames = ["Latitude", "Longitude", "Title", "Description"]

# Coordinates: every chunk that follows a '<georss:point>' tag starts with
# "lat lon</georss:point>".
lstPoints = myfile.split('<georss:point>')
Latitudes = []
Longitudes = []
for Gval in lstPoints[1:]:
    end = Gval.find('</georss:point>')
    if end == -1:
        continue
    coords = Gval[:end].split()
    if len(coords) != 2:
        continue  # not a "lat lon" pair
    Latitudes.append(coords[0])
    Longitudes.append(coords[1])

# Titles and descriptions: splitting on '<item>' removes the tag itself, so
# testing a chunk for '<item>' can never succeed.  Chunk 0 is the channel
# header; every later chunk is one item.  Both fields are read from each item
# (the original if/elif allowed only one of them per chunk), which keeps the
# two lists the same length.
lstTitles = myfile.split('<item>')
Titles = []
Descriptions = []
for Tval in lstTitles[1:]:
    Titles.append(_tag_text(Tval, 'title'))
    Descriptions.append(_tag_text(Tval, 'description'))

Related

Remove row from the CSV file if condition met

I am trying to scrape pickels.com.au.
I am trying to update the pickels_dataset.csv file: if the link is the same but the price is not, then I want to remove the old row and insert the new row into the CSV file. However, the old entry is not removed from the CSV file.
What would be the best way to remove and update the row in the CSV file.
Below is my code...
import requests
from scrapy.selector import Selector
import csv
import re
from tqdm import tqdm
from time import sleep
# Scrape upcoming Pickles car auctions and append them to pickels_dataset.csv.
# NOTE(review): the indentation of this listing was lost, so the nesting of
# the statements below (in particular the for/else/break/continue near the
# end) cannot be determined from the text alone.
# The CSV is opened in append mode here.
with open('pickels_dataset.csv', 'a+', newline='', encoding='utf-8') as auction_csv_file:
auction_csv_writer = csv.writer(auction_csv_file)
live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
api_request = requests.get(url=live_auctions_api)
for auctions in api_request.json():
auction_link = auctions.get('viewSaleListingLink')
if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
auction_request = requests.get(url=auction_link)
response = Selector(text=auction_request.text)
sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
if sales_id == []:
continue
auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
auction_sale_link_requests = requests.get(url=auction_sale_link)
auctions_data = auction_sale_link_requests.json().get('SearchResults')
if auctions_data == []:
print("NO RESULTS")
for auction_data in auctions_data:
if int(auction_data.get('MinimumBid')) > 0:
ids = auction_data.get('TargetId')
main_title = auction_data.get('Title')
short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
# NOTE(review): the key 'M ake' contains a space, while short_title above
# reads 'Make' -- confirm which key the API returns.
make = auction_data.get('M ake')
model = auction_data.get('Model')
variant = auction_data.get('Series')
transmission = auction_data.get('Transmission')
odometer = auction_data.get('Odometer')
state = auction_data.get('Location').get('State')
sale_price = auction_data.get('MinimumBid')
link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
sale_date = auction_data.get('SaleEndString')
# Row layout: index 8 is the formatted price, index 9 is the link.
auction_values = [
main_title, short_title, make,
model, variant, transmission, odometer,
state, "${:,.2f}".format(sale_price).strip() ,
link, sale_date
]
# The same file is opened a second time, now for reading, to look for an
# existing row that contains this link.
with open('pickels_dataset.csv', 'r+') as csv_read:
auction_reader = list(csv.reader(csv_read))
for each in auction_reader:
if link in each:
# NOTE(review): each[0] is main_title in the row layout above, and the saved
# price (index 8) is a formatted string while sale_price is the raw
# MinimumBid value -- this comparison looks unintended.
each_link, each_price = each[9], each[0]
if (link == each_link) and (sale_price != each_price):
# clear() empties the in-memory list only; nothing is rewritten to the file.
auction_reader.clear()
print('New list found, old list deleted')
auction_csv_writer.writerow(auction_values)
print('New value added')
continue
elif (link == each[9]) and (sale_price == each[0]):
print('Same result already exist in the file')
continue
else:
auction_csv_writer.writerow(auction_values)
print('Unique result found and added.')
break
Your current script is opening your auction CSV file for appending, and then whilst it is still open, attempting to open it again for reading. This is probably why it is not updating as expected.
A better approach would be to first read the entire contents of your existing saved auction file into a dictionary. The key could be the link which would then make it easy to determine if you have already seen an existing auction.
Next scrape the current auctions and update the saved_auctions dictionary as needed.
Finally at the end, write the contents of saved_auctions back to the CSV file.
For example:
import requests
from scrapy.selector import Selector
import csv
import re
auction_filename = 'pickels_dataset.csv'

# Column positions inside a saved row (they mirror auction_values below).
PRICE_COLUMN = 8
LINK_COLUMN = 9

# Load existing auctions into a dictionary with link as key
saved_auctions = {}
try:
    with open(auction_filename, newline='', encoding='utf-8') as f_auction_file:
        for row in csv.reader(f_auction_file):
            if len(row) > LINK_COLUMN:  # skip blank or truncated rows
                saved_auctions[row[LINK_COLUMN]] = row
except FileNotFoundError:
    # First run: nothing has been saved yet. The file is created by the
    # write at the end of the script.
    pass

live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
api_request = requests.get(url=live_auctions_api)

for auctions in api_request.json():
    auction_link = auctions.get('viewSaleListingLink')
    if not auction_link or 'cars/item/search/-/listing/listSaleItems/' not in auction_link:
        continue

    auction_request = requests.get(url=auction_link)
    response = Selector(text=auction_request.text)
    # .get() yields None when no <script> matches; re.findall would then raise.
    sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
    if sales_id_re is None:
        continue
    sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
    if not sales_id:
        continue

    auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
    auction_sale_link_requests = requests.get(url=auction_sale_link)
    # A missing 'SearchResults' key is treated like an empty result list.
    auctions_data = auction_sale_link_requests.json().get('SearchResults') or []
    if not auctions_data:
        print("NO RESULTS")

    for auction_data in auctions_data:
        minimum_bid = auction_data.get('MinimumBid')
        if minimum_bid is None or int(minimum_bid) <= 0:
            continue

        ids = auction_data.get('TargetId')
        main_title = auction_data.get('Title')
        short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
        make = auction_data.get('Make')
        model = auction_data.get('Model')
        variant = auction_data.get('Series')
        transmission = auction_data.get('Transmission')
        odometer = auction_data.get('Odometer')
        state = (auction_data.get('Location') or {}).get('State')
        # Stored as a formatted string so it compares equal to the value
        # read back from the CSV.
        sale_price = "${:,.2f}".format(minimum_bid).strip()
        link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
        link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
        sale_date = auction_data.get('SaleEndString')

        auction_values = [
            main_title, short_title, make,
            model, variant, transmission, odometer,
            state, sale_price,
            link, sale_date,
        ]

        if link in saved_auctions:
            if saved_auctions[link][PRICE_COLUMN] == sale_price:
                print('Same result already exists in the file')
            else:
                print('New value updated')
                saved_auctions[link] = auction_values  # Updated the entry
        else:
            print('New auction added')
            saved_auctions[link] = auction_values

# Update the saved auction file
with open(auction_filename, 'w', newline='', encoding='utf-8') as f_auction_file:
    csv_auction_file = csv.writer(f_auction_file)
    csv_auction_file.writerows(saved_auctions.values())
If you want to also remove auctions that are no longer active, then it would probably be best to simply ignore the saved file and just write all current entries as is.

How to scrape data from PDF into Excel

I am trying to scrape the data from PDF and get it saved into an excel file. This is the pdf I needed: https://www.medicaljournals.se/acta/content_files/files/pdf/98/219/Suppl219.pdf
However, I need to scrape not all the data but the following one (below), and then saved it to excel in different cells:
From page 5, starting from P001 to and including Introduction - there is a P number, title, people names, and Introduction.
For now, I can only convert a PDF file into text (my code below) and save it all in one cell, but I need it to be separated into a different cells
import PyPDF2 as p2
# Collect the text of every page from index 6 onwards, one list entry per
# page, with line breaks removed.
# NOTE(review): the file name was wrapped across two lines in the original
# listing, leaving an unterminated string; the two halves are joined with a
# single space here -- confirm against the real file name.
with open('Abstract Book from the 5th World Psoriasis and Psoriatic Arthritis '
          'Conference 2018.pdf', 'rb') as PDFfile:
    pdfread = p2.PdfFileReader(PDFfile)
    pdflist = []
    for i in range(6, pdfread.getNumPages()):
        pageinfo = pdfread.getPage(i)
        # print(pageinfo.extractText())
        pdflist.append(pageinfo.extractText().replace('\n', ''))
print(pdflist)
The main things you need are a 'header' regex (a run of at least 15 uppercase characters) and an 'article' regex (the letter 'P' followed by 3 digits).
One more regex helps you split the text at any of the section keywords.
import re

import PyPDF2

# P001: letter 'P' and 3 digits. The group is capturing so that re.split
# keeps the article number instead of throwing it away.
article_re = re.compile(r'(P\d{3})')
# min 15 UPPERCASE letters, including '\n' and '-'
header_re = re.compile(r'[A-Z\s\-]{15,}')
key_word_delimeters = ['Peoples', 'Introduction', 'Objectives', 'Methods', 'Results', 'Conclusions', 'References']
# "Keyword:" for every section keyword. 'Peoples' is the unlabeled text
# before the first keyword, so it is not part of the pattern. The group is
# capturing so each piece of text can be filed under the keyword that
# actually precedes it, even when an article omits a section.
section_re = re.compile('(' + '|'.join(key_word_delimeters[1:]) + '):')

with open('data.pdf', 'rb') as file:
    pdf = PyPDF2.PdfFileReader(file)
    # all text in one variable
    text = ''.join(pdf.getPage(i).extractText() for i in range(6, 63))

articles = []
# With a capturing pattern re.split returns
# [preamble, number, body, number, body, ...].
parts = article_re.split(text)
for number, article in zip(parts[1::2], parts[2::2]):
    header_match = header_re.match(article)
    if not header_match:
        continue  # no uppercase title right after the article number
    header = header_match.group()
    other_text = article[header_match.end():]

    # The greedy header pattern also swallows the first (capital) letter of
    # the first author name: cut it off the header and give it back to the
    # remaining text.
    first_name_letter = header[-1]
    header = header[:-1]
    header = header.replace('\n', '')  # delete line breaks
    header = header.replace('-', '')  # delete line-break hyphens
    other_text = first_name_letter + other_text

    item = {'number': number, 'header': header}
    # pieces is [people, keyword, text, keyword, text, ...]
    pieces = section_re.split(other_text)
    item[key_word_delimeters[0]] = pieces[0].replace('\n', '')
    for key, data in zip(pieces[1::2], pieces[2::2]):
        item[key] = data.replace('\n', '')
    articles.append(item)

Finding a piece of information in a document and deleting everything before and after

I have some .docx files that are very specifically formatted.
I have copied the file 5 times to represent the 5 different strings that I require to be "found" and everything else removed.
#! python 3
import docx
import os
import shutil
import readDocx as rD
def delete_paragraph(paragraph):
    """Detach *paragraph*'s underlying element from its parent element.

    Reads ``paragraph._element``, removes that element from the object
    returned by its ``getparent()``, then sets the element's ``_p`` and
    ``_element`` attributes to ``None``. Returns ``None``.
    """
    p = paragraph._element
    p.getparent().remove(p)
    p._p = p._element = None
# Select the file you want to work with
fP = rD.file
# Get the working directory for the file
nfP = os.path.dirname(os.path.abspath(fP))
# print(nfP)
# Get the filename only. os.path.basename understands the platform's path
# separators, so it agrees with os.path.dirname above (a manual split('/')
# does not on paths written with backslashes).
fileCode = os.path.basename(fP)
# print(fileCode)
# Separate the course code: the text before the first space in the filename
nameSplit = fileCode.split(' ')
courseCode = nameSplit[0]
# print(courseCode)

# List of files that we need to create
a1 = "Assessment Summary"
a2 = "Back to Business project"
a3 = "Back to Business Checklist"
a4 = "Skills Demonstration"
a5 = "Skills Demonstration Checklist"
names = [a1, a2, a3, a4, a5]

# Creates a list for the new filenames to sit in
newFiles = []
# Creates the files from the original: one copy per name, placed next to
# the source document.
for name in names:
    # Let os.path.join insert the separator instead of hard-coding '\\'.
    fileName = os.path.join(nfP, '{} {} Version 1.0.docx'.format(courseCode, name))
    shutil.copy(fP, fileName)
    # print(fileName)
    newFiles.append(fileName)
# print(newFiles)

# Need to iterate through the files and start deleting data.
h1 = "Learner Declaration"
h2 = "Back to Business Project"
h3 = "Assessor Observation Checklist / Marking Guide"
h4 = "Skills Demonstration"
h5 = "Assessor Observation Checklist / Marking Guide"
This is where I start to fail in my limited skill. The h1-5 tags represent the heading of the document pieces that I want to keep.
How can I iterate through the document, find the heading and delete everything before / after these paragraphs?
I don't necessarily need the answer, just more of a "look in this direction".
Thanks
Try this. Have clearly mentioned in the comments what the code does.
from docx import Document #Package "Python-docx" needs to be installed to import this
import pandas as pd
# Read the document into a python-docx Document object
document = Document('Path/to/your/input/.docx/document')

# Collect one record per paragraph (its text and its style name) and build
# the dataframe in a single step. Row-by-row DataFrame.append was removed in
# pandas 2.0.
paragraph_records = []
for para in document.paragraphs:
    # Extract paragraph style
    style = str(para.style.name)
    # A paragraph whose entire text is made of bold runs is treated as a
    # heading even if its style is not a heading style.
    runboldtext = ''.join(run.text for run in para.runs if run.bold)
    if runboldtext != '' and runboldtext == str(para.text):
        print("Bold True for:", runboldtext)
        style = 'Heading'
    paragraph_records.append({'para_text': para.text, 'style': style})
document_text_dataframe = pd.DataFrame(paragraph_records, columns=['para_text', 'style'])

# Need to iterate through the files and start deleting data.
h1 = "Learner Declaration"
h2 = "Back to Business Project"
h3 = "Assessor Observation Checklist / Marking Guide"
h4 = "Skills Demonstration"
h5 = "Assessor Observation Checklist / Marking Guide"
# h5 has the same text as h3; the search below resumes after the previous
# match of the same text, so a second occurrence is picked up for h5.
h_list = [h1, h2, h3, h4, h5]

para_texts = document_text_dataframe['para_text'].tolist()
styles = document_text_dataframe['style'].tolist()

# Each entry of extracted_content is a dataframe holding one section: the
# matching heading row and every row up to (not including) the next row
# whose style contains 'Heading'.
extracted_content = []
next_search_row = {}  # heading text -> row at which to resume searching
for h in h_list:
    try:
        start_index = para_texts.index(h, next_search_row.get(h, 0))
    except ValueError:
        continue  # heading not present (or no further occurrence)
    next_search_row[h] = start_index + 1
    print("Found match in document for: ", h)
    print("Matching index=", start_index)

    # The section ends at the next heading, or at the end of the document
    # when no heading follows. Row 0 is a valid start and the last row is
    # included in the scan.
    end_index = len(styles)
    for i in range(start_index + 1, len(styles)):
        if 'Heading' in styles[i]:
            end_index = i
            break

    extracted_content.append(document_text_dataframe.iloc[start_index:end_index].copy())

# The list "extracted_content" will consist of dataframes. Each dataframe
# will correspond to the extracted information of each "h" value.
print(extracted_content)
Now, using extracted_content, you can write every entry in the list extracted_content to a separate .docx document using your code.
Cheers!

A loop to extract URLS from several text files

I am attempting to extract a list of URLS from several files using a for loop, however this is resulting in a list of URLS from only the first file, repeated 10 times. I'm not sure what I am doing wrong. Also, I am an absolute beginner at this, so I will presume that there are much better ways of trying to achieve what I want, however this is what I have so far.
# Collect (form type, URL) pairs from fixed-width index files.
# NOTE(review): the indentation of this listing was lost, so the nesting of
# the loops below cannot be determined from the text alone.
type_urls = []
# y is initialised once, before the outer loop.
y = 0
for files in cwk_dir:
while y < 10:
# The object returned by open() is not assigned to a name or read from.
open('./cwkfiles/cwkfile{}.crawler.idx'.format(y))
# NOTE(review): r is not defined in this snippet; the lines come from
# r.text, not from the file opened above.
lines = r.text.splitlines()
# Column start positions are looked up in the header line (index 7).
header_loc = 7
name_loc = lines[header_loc].find('Company Name')
type_loc = lines[header_loc].find('Form Type')
cik_loc = lines[header_loc].find('CIK')
filedate_loc = lines[header_loc].find('Date Filed')
url_loc = lines[header_loc].find('URL')
# Data lines start at index 9.
firstdata_loc = 9
for line in lines[firstdata_loc:]:
company_name = line[:type_loc].strip()
form_type = line[type_loc:cik_loc].strip()
cik = line[cik_loc:filedate_loc].strip()
file_date = line[filedate_loc:url_loc].strip()
page_url = line[url_loc:].strip()
typeandurl = (form_type, page_url)
type_urls.append(typeandurl)
y = y + 1
Here is a more Pythonic way using pathlib and Python 3:
from pathlib import Path
# Collect (form type, URL) pairs from every index file in ./cwkfiles.
cwk_dir = Path('./cwkfiles')
type_urls = []
header_loc = 7  # index of the line holding the column titles
firstdata_loc = 9  # index of the first data line

for cwkfile in cwk_dir.glob('cwkfile*.crawler.idx'):
    with cwkfile.open() as f:
        lines = f.readlines()
    if len(lines) <= header_loc:
        continue  # too short to contain the header line

    # The columns are fixed-width: each one starts where its title starts
    # in the header line. Only the boundaries of the two columns that are
    # kept need to be located.
    header = lines[header_loc]
    type_loc = header.find('Form Type')
    cik_loc = header.find('CIK')
    url_loc = header.find('URL')

    for line in lines[firstdata_loc:]:
        form_type = line[type_loc:cik_loc].strip()
        page_url = line[url_loc:].strip()
        type_urls.append((form_type, page_url))
If you want to test on a small batch of files, replace cwk_dir.glob('cwkfile*.crawler.idx') with cwk_dir.glob('cwkfile[0-9].crawler.idx'). That will give you the first ten files if they are sequentially numbered, starting from 0.
And here is a better way to put it all together in a more readable form:
from pathlib import Path
def get_offsets(header):
    """Return the start offset of each column title within *header*.

    The result maps ``company_name``, ``form_type``, ``cik``, ``file_date``
    and ``page_url`` to the index at which the matching title begins in
    *header*. A title that does not occur maps to ``-1`` (the result of
    ``str.find``).
    """
    titles = {
        'company_name': 'Company Name',
        'form_type': 'Form Type',
        'cik': 'CIK',
        'file_date': 'Date Filed',
        'page_url': 'URL',
    }
    return {field: header.find(title) for field, title in titles.items()}
def get_data(line, offsets):
    """Slice one fixed-width *line* into its five fields.

    *offsets* maps ``form_type``, ``cik``, ``file_date`` and ``page_url``
    to the index at which that column starts. Each field runs from its own
    start to the start of the next column (the first from the beginning of
    the line, the last to the end of the line) and is stripped of
    surrounding whitespace.
    """
    form_type_start = offsets['form_type']
    cik_start = offsets['cik']
    file_date_start = offsets['file_date']
    page_url_start = offsets['page_url']
    return {
        'company_name': line[:form_type_start].strip(),
        'form_type': line[form_type_start:cik_start].strip(),
        'cik': line[cik_start:file_date_start].strip(),
        'file_date': line[file_date_start:page_url_start].strip(),
        'page_url': line[page_url_start:].strip(),
    }
# Collect (form type, URL) pairs from every index file in ./cwkfiles.
cwk_dir = Path('./cwkfiles')
types_and_urls = []
header_line = 7  # index of the line holding the column titles
first_data_line = 9  # index of the first data line

for cwkfile in cwk_dir.glob('cwkfile*.crawler.idx'):
    with cwkfile.open() as f:
        lines = f.readlines()
    if len(lines) <= header_line:
        continue  # too short to contain the header line
    offsets = get_offsets(lines[header_line])
    for line in lines[first_data_line:]:
        if not line.strip():
            continue  # a blank line carries no record
        data = get_data(line, offsets)
        types_and_urls.append((data['form_type'], data['page_url']))
When you get to the second file, the while condition fails as y is already 10.
Try setting y back to 0 just before the while loop:
for files in cwk_dir:
y = 0
while y < 10:
...
And as you're opening the file in the first line inside the while loop, you probably need to close it when exiting the loop.

split() issues with pdf extractText()

I'm working on a small content-analysis program that should run through several PDF files and return how often certain words are mentioned in the text. The words to search for are specified in a separate text file (list.txt) and can be altered. The program runs just fine on files in .txt format, but the result is completely different when it is run on a .pdf file. To illustrate, the test text that I have the program running through is the following:
"Hello
This is a product development notice
We’re working with innovative measures
A nice Innovation
The world that we live in is innovative
We are currently working on a new process
And in the fall, you will experience our new product development introduction"
The list of words grouped in categories are the following (marked in .txt file with ">>"):
innovation: innovat
product: Product, development, introduction
organization: Process
The output from running the code with a .txt file is the following:
Whereas the ouput from running it with a .pdf is the following:
As you can see, my issue concerns the splitting of the words: in the .pdf output, a string like "world" can be split into 'w','o','rld'. I have searched tirelessly for why this happens, without success. As I am rather new to Python programming, I would appreciate any answer, or a pointer to a source where I can find an answer, as to why this happens.
Thanks
The code for the .txt is as follows:
import string, re, os
import PyPDF2
with open('list.txt') as dictfile:
    lines = dictfile.readlines()

# import the dictionary: a line starting with '>>' opens a category, every
# other non-blank line is compiled as a case-insensitive regular expression
# belonging to the current category. This does not depend on the report
# being scored, so it is done once, before the loop.
dic = {}
categories = ["Default"]  # a default category for simple word lists
current_category = "Default"
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        if current_category not in categories:
            categories.append(current_category)
    else:
        line = line.strip()
        if line:
            dic[re.compile(line, re.IGNORECASE)] = current_category

for i in range(2011, 2012):
    f = 'annual_report_' + str(i) + '.txt'
    with open(f) as textfile:
        # Split on whitespace. The case is left as-is because the patterns
        # are compiled with re.IGNORECASE.
        text = textfile.read().split()
    print(text)

    # every category starts at zero for each report
    scores = dict.fromkeys(categories, 0)

    # examine the text: a token counts when a pattern matches at its start
    for token in text:
        for pattern, categ in dic.items():
            if pattern.match(token):
                scores[categ] += 1

    print(os.path.basename(f))
    for key in scores:
        print(key, ":", scores[key])
While the code for the .pdf is as follows:
import string, re, os
import PyPDF2
with open('list.txt') as dictfile:
    lines = dictfile.readlines()

# import the dictionary: a line starting with '>>' opens a category, every
# other non-blank line is compiled as a case-insensitive regular expression
# belonging to the current category. This does not depend on the report
# being scored, so it is done once, before the loop.
dic = {}
categories = ["Default"]  # a default category for simple word lists
current_category = "Default"
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        if current_category not in categories:
            categories.append(current_category)
    else:
        line = line.strip()
        if line:
            dic[re.compile(line, re.IGNORECASE)] = current_category

for i in range(2011, 2012):
    f = 'annual_report_' + str(i) + '.pdf'
    # Gather the tokens of ALL pages. Assigning the split text of each page
    # to the same name keeps only the last page for scoring.
    tokens = []
    with open(f, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        for pageNum in range(reader.numPages):
            page = reader.getPage(pageNum)
            # NOTE(review): the tokens are used exactly as extractText()
            # returns them. If the extracted text contains whitespace
            # inside words, split() yields word fragments.
            tokens.extend(page.extractText().split())
    print(tokens)

    # every category starts at zero for each report
    scores = dict.fromkeys(categories, 0)

    # examine the text: a token counts when a pattern matches at its start
    for token in tokens:
        for pattern, categ in dic.items():
            if pattern.match(token):
                scores[categ] += 1

    print(os.path.basename(f))
    for key in scores:
        print(key, ":", scores[key])

Categories