Reading files in Python

I want to read files in a more advanced way.
First:
In this file I have a list of steps that the code has to follow. How do I read the steps until the string [data] appears?
[Steps]
step1 = WebAddress
step2 = Tab
step3 = SecurityType
step4 = Criteria
step5 = Date
step6 = Click1
step7 = Results
step8 = Download
[data]
......
Second:
How can I read everything after [data]?
[data]
WebAddress___________________________ Destination___________ Tab_____________ SecurityType___________________________________________________ Criteria___ Date_______ Click1_ Results_ Download
https://mbsdisclosure.fanniemae.com/ q:\\%s\\raw\\fnmapool Advanced Search Interim MBS: Single-Family Issue Date 09/01/2012 Search 100 CSV XML
https://mbsdisclosure.fanniemae.com/ q:\\%s\\raw\\fnmapool Advanced Search Preliminary Mega: Fannie Mae/Ginnie Mae backed Adjustable Rate Issue Date 09/01/2012 Search 100 CSV XML
https://mbsdisclosure.fanniemae.com/ q:\\%s\\raw\\fnmapool Advanced Search Preliminary Mega: Fannie Mae/Ginnie Mae backed Fixed Rate Issue Date 09/01/2012 Search 100 CSV XML
I want to pass everything under each step____________________ column, where step is one of the steps above (e.g. WebAddress).
So, for example, if step1 = WebAddress, how do I read everything under WebAddress__________________________, and so on? Thanks!

First:
with open(file_name) as f:
    print (f.read()).split("[data]")
Second:
with open(file_name) as f:
    pre_data, post_data = [s.strip() for s in f.read().split("[data]")]
    post_data_lines = post_data.splitlines()
    headers = post_data_lines[0].split()
    print headers
    for line in post_data_lines[1:]:
        print line.split()
        print dict(zip(headers, line.split()))
I'm also not sure how your [data] section is delimited; you may want line.split('\t') if it's tab-separated.
This is untested, but it should work. It doesn't quite get you all the way to where you want, but it gets most of what you want (the "hard" parts).
To split by header width, use:
file_name = "testdata.txt"
with open(file_name) as f:
    pre_data, post_data = [s.strip() for s in f.read().split("[data]")]
    post_data_lines = post_data.splitlines()
    headers = post_data_lines[0].split()
    for line in post_data_lines[1:]:
        tmpline = []
        pos = 0
        for itm in headers:
            tmpline.append(line[pos:pos+len(itm)])
            pos += len(itm) + 1
        print dict(zip(headers, tmpline))
And if you want the actual header without the __'s, then use:
file_name = "testdata.txt"
with open(file_name) as f:
    pre_data, post_data = [s.strip() for s in f.read().split("[data]")]
    post_data_lines = post_data.splitlines()
    headers = post_data_lines[0].split()
    headers2 = [s.replace("_", " ").strip() for s in headers]
    for line in post_data_lines[1:]:
        tmpline = []
        pos = 0
        for itm in headers:
            tmpline.append(line[pos:pos+len(itm)])
            pos += len(itm) + 1
        print dict(zip(headers2, tmpline))

First step:
>>> import ConfigParser
>>> cfg = ConfigParser.RawConfigParser()
>>> with open('sample.cfg') as f:
...     cfg.readfp(f)
...
>>> cfg.get('Steps','step1')
'WebAddress'
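If you are on Python 3, the module was renamed to configparser and readfp() became read_file(); a rough equivalent, untested (and note that the free-form lines under [data] are not key=value pairs, so a strict parser may complain about them):
import configparser

cfg = configparser.RawConfigParser()
with open('sample.cfg') as f:
    cfg.read_file(f)   # Python 3 name for readfp()

print(cfg.get('Steps', 'step1'))   # -> 'WebAddress'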
Second step:
>>> import csv, io
>>> data_section = ''
>>> with open('sample.cfg') as f:
...     data_section = f.read()
...
>>> data = data_section[data_section.index('[data]')+len('[data]')+1:]
>>> reader = csv.reader(io.BytesIO(data), delimiter='\t')
>>> reader.next() # skips header
>>> results = [row for row in reader]
Now results is a list of lists, with each inner list having items from the data section.
[['https://mbsdisclosure.fanniemae.com/','q:\\\\%s\\\\raw\\\\fnmapool','Advanced Search', 'Interim MBS: Single-Family', 'Issue Date','09/01/2012','Search','100', 'CSV XML']...]
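From here, getting "everything under WebAddress" is mostly a matter of keeping the header names around. An untested continuation of the session above (it assumes the tab-delimited reading really does line up with the header columns):
>>> names = [h.strip('_') for h in data.splitlines()[0].split()]
>>> rows = [dict(zip(names, row)) for row in results]
>>> [row['WebAddress'] for row in rows]
['https://mbsdisclosure.fanniemae.com/', 'https://mbsdisclosure.fanniemae.com/', 'https://mbsdisclosure.fanniemae.com/']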

Related

Writing Printed Output to CSV - Numpy

I want this output written to a CSV file:
['https://www.lendingclub.com/loans/personal-loans' '6.16% to 35.89%']
['https://www.lendingclub.com/loans/personal-loans' '1% to 6%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.discover.com/personal-loans/' '6.99% to 24.99%']
However, when I run the code to write the output to CSV, I only get the last line written to the CSV file:
['https://www.discover.com/personal-loans/' '6.99% to 24.99%']
Could it be because my printed output is not comma-separated? I attempted to get around having to put a comma in there by using a space as the delimiter. Let me know your thoughts. I would love some help on this because I am having the hardest time reshaping this collected data.
plcompetitors = ['https://www.lendingclub.com/loans/personal-loans',
                 'https://www.marcus.com/us/en/personal-loans',
                 'https://www.discover.com/personal-loans/']
#cycle through links in array until it finds APR rates/fixed or variable using regex
for link in plcompetitors:
    cdate = datetime.date.today()
    l = r.get(link)
    l.encoding = 'utf-8'
    data = l.text
    soup = bs(data, 'html.parser')
    #captures Discover's rate perfectly but catches too much for lightstream/prosper
    paragraph = soup.find_all(text=re.compile('[0-9]%'))
    for n in paragraph:
        matches = re.findall('(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
        try:
            irate = str(matches[0])
            array = np.asarray(irate)
            array2 = np.append(link,irate)
            array2 = np.asarray(array2)
            print(array2)
            #with open('test.csv', "w") as csv_file:
            #    writer = csv.writer(csv_file, delimiter=' ')
            #    for line in test:
            #        writer.writerow(line)
        except IndexError:
            pass
When it comes to writing CSV files, pandas comes in handy.
import datetime
import requests as r
from bs4 import BeautifulSoup as bs
import numpy as np
import regex as re
import pandas as pd

plcompetitors = ['https://www.lendingclub.com/loans/personal-loans',
                 'https://www.marcus.com/us/en/personal-loans',
                 'https://www.discover.com/personal-loans/']
df = pd.DataFrame({'Link':[],'APR Rate':[]})
#cycle through links in array until it finds APR rates/fixed or variable using regex
for link in plcompetitors:
    cdate = datetime.date.today()
    l = r.get(link)
    l.encoding = 'utf-8'
    data = l.text
    soup = bs(data, 'html.parser')
    #captures Discover's rate perfectly but catches too much for lightstream/prosper
    paragraph = soup.find_all(text=re.compile('[0-9]%'))
    for n in paragraph:
        matches = re.findall('(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
        irate = ''
        try:
            irate = str(matches[0])
            df2 = pd.DataFrame({'Link':[link],'APR Rate':[irate]})
            df = pd.concat([df,df2],join="inner")
        except IndexError:
            pass
df.to_csv('CSV_File.csv',index=False)
I have stored each link and its irate value in a data frame df2 and concatenated it to the parent data frame df.
At the end, I write the parent data frame df to a CSV file.
I think the problem is that you are opening the file in write-mode (the "w" in open('test.csv', "w")), meaning that Python overwrites what's already written in the file. I think you're looking for append-mode:
# open the file before the loop, and close it after
csv_file = open("test.csv", 'a')  # change the 'w' to an 'a'
csv_file.truncate(0)  # clear the contents of the file
writer = csv.writer(csv_file, delimiter=' ')  # make the writer beforehand for efficiency
for n in paragraph:
    matches = re.findall('(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
    try:
        irate = str(matches[0])
        array = np.asarray(irate)
        array2 = np.append(link,irate)
        array2 = np.asarray(array2)
        print(array2)
        writer.writerow(array2)  # write the [link, rate] pair for this match
    except IndexError:
        pass
# close the file
csv_file.close()
If this doesn't work, please let me know!
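Another option, sketched here untested, is to avoid append mode entirely: collect the rows in a list inside the loops and write them all once at the end, so opening the file with "w" only truncates it a single time:
import csv

rows = []  # collect [link, rate] pairs across all links
for link in plcompetitors:
    # ... the requests/BeautifulSoup code from the question goes here ...
    for n in paragraph:
        matches = re.findall('(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
        if matches:
            rows.append([link, str(matches[0])])

# a single write at the end
with open('test.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=' ')
    writer.writerows(rows)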

Sorting and sequencing data from a file

I've developed a program that stores a list of ids, like so:
But for the desired purposes, the data should take a sequential form, so that the first pair of ids becomes something like: "889926212541448192" becomes 1 and "889919950248448000" becomes 2. That is, the output file should look something like:
where the first id connects with 2, 3 and 6, and id 4 connects only with 5, forming a network.
I have no experience in this area, but I cannot find a way to do this reading.
I tried writing a few programs, but they only read row by row, not column by column, id to id. The data is saved by the following program:
import json

arq = open('ids.csv','w')
arq.write('Source'+','+'Target')
arq.write("\n")
lista_rede = [] #list to store all ids
with open('dados_twitter.json', 'r') as f:
    for line in f:
        lista = []
        tweet = json.loads(line) # to write as a Python dictionary
        lista = list(tweet.keys()) #write list of keys
        try:
            if 'retweeted_status' in lista:
                id_rt = json.dumps(tweet['retweeted_status']['id_str'])
                id_status = json.dumps(tweet['id_str'])
                lista_rede.append(tweet['id_str'])
                lista_rede.append(tweet['retweeted_status']['id_str'])
                arq.write( id_status +','+ id_rt )
                arq.write("\n")
            if 'quoted_status' in lista:
                id_rt = json.dumps(tweet['quoted_status']['id_str'])
                id_status = json.dumps(tweet['id_str'])
                lista_rede.append(tweet['id_str'])
                lista_rede.append(tweet['quoted_status']['id_str'])
                arq.write( id_status +','+ id_rt )
                arq.write("\n")
        except:
            continue
arq.close()
As a result I have a file with the ids in pairs of interactions.
How can I then rearrange these data when reading them, or even when writing them, in Python or another language?
The following snippet should do the job:
import re

header = ''
id_dict = {}

# read the ids
with open('ids.csv') as fr:
    header = fr.readline()
    for line in fr:
        ids = [int(s) for s in re.findall(r'\d+', line)]
        try:
            id_dict[int(ids[0])].append(int(ids[1]))
        except:
            id_dict[int(ids[0])] = [int(ids[1])]

# sort the ids
for key in id_dict:
    id_dict[key].sort()

# save the sorted ids in a new file
with open('ids_sorted.txt', 'w') as fw:
    # fw.write(header)
    for key in sorted(id_dict):
        for value in id_dict[key]:
            fw.write("{0} {1}\n".format(key, value))
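The question also wants the long ids renumbered sequentially (the first id seen becomes 1, the next 2, and so on). A small untested addition on top of id_dict, writing to a hypothetical ids_sequential.txt:
# Untested sketch: give each distinct id a sequential number in the order it
# is written out, then save the renumbered pairs.
seq = {}

def number_of(identifier):
    # hand out 1, 2, 3, ... the first time each id is seen
    if identifier not in seq:
        seq[identifier] = len(seq) + 1
    return seq[identifier]

with open('ids_sequential.txt', 'w') as fw:
    for key in sorted(id_dict):
        for value in id_dict[key]:
            fw.write("{0} {1}\n".format(number_of(key), number_of(value)))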

Removing phrases from a file based on another file (Python)

How do I do this in python?
badphrases.txt contains
Go away
Don't do that
Stop it
allphrases.txt contains
I don't know why you do that. Go away.
I was wondering what you were doing.
You seem nice
I want allphrases.txt to be clean of the lines in badphrases.txt.
It's trivial in bash
cat badfiles.txt | while read b
do
cat allphrases.txt | grep -v "$b" > tmp
cat tmp > allphrases.txt
done
Oh, you thought I hadn't looked or tried. I searched for over an hour.
Here's my code:
# Files
ttv = "/tmp/tv.dat"
tmp = "/tmp/tempfile"
bad = "/tmp/badshows"
badfiles already exists
...code right here creates ttv
# Function grep_v
def grep_v(f, str):
    file = open(f, "r")
    for line in file:
        if line in str:
            return True
    return False

t = open(tmp, 'w')
tfile = open(ttv, "r")
for line in tfile:
    if not grep_v(bad, line):
        t.write(line)
tfile.close()
t.close()
os.rename(tmp, ttv)
First, google how to read a file in Python.
You will probably get something like this: How do I read a file line-by-line into a list?
Use this to read both files into lists:
with open('badphrases.txt') as f:
    content = f.readlines()
badphrases = [x.strip() for x in content]

with open('allphrases.txt') as f:
    content = f.readlines()
allphrases = [x.strip() for x in content]
Now you have both contents in lists.
Iterate over allphrases and check if any phrase from badphrases is present in it.
At this point you might consider googling:
- how to iterate over a list python
- how to check if string present in another string python
Take the code from those places and build a brute-force algorithm like this:
for line in allphrases:
    flag = True
    for badphrase in badphrases:
        if badphrase in line:
            flag = False
            break
    if flag:
        print(line)
If you can understand this code, you will notice you need to replace print with output to a file.
Now google how to print to a file in Python.
Then think about how to improve the algorithm. All the best.
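As an untested sketch of that "output to file" step (cleanphrases.txt is just a placeholder name), reusing the lists read above:
# Untested sketch: write the lines that contain no bad phrase to an output
# file instead of printing them.
with open('cleanphrases.txt', 'w') as out:
    for line in allphrases:
        if not any(badphrase in line for badphrase in badphrases):
            out.write(line + '\n')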
UPDATE:
@COLDSPEED suggested you can simply google:
- how to replace lines in a file in python
You might get something like this: Search and replace a line in a file in Python
Which also works.
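One untested way to do that kind of in-place rewrite with the standard library is fileinput with inplace=True, which redirects print back into the file being read:
import fileinput

# Untested sketch: lines printed inside the loop replace the file's contents,
# so only the lines without a bad phrase survive.
with open('badphrases.txt') as f:
    badphrases = [x.strip() for x in f if x.strip()]

for line in fileinput.input('allphrases.txt', inplace=True):
    if not any(bad in line for bad in badphrases):
        print(line, end='')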
My solution is not too bad:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import feedparser, os, re

# Files
h = os.environ['HOME']
ttv = h + "/WEB/Shows/tv.dat"
old = h + "/WEB/Shows/old.dat"
moo = h + "/WEB/Shows/moo.dat"
tmp = h + "/WEB/Shows/tempfile"
bad = h + "/WEB/Shows/badshows"

# Function not_present
def not_present(f, str):
    file = open(f, "r")
    for line in file:
        if str in line:
            return False
    return True

# Sources (shortened)
sources = ['http://predb.me/?cats=tv&rss=1']

# Grab all the feeds and put them into ttv and old
k = open(old, 'a')
f = open(ttv, 'a')
for h in sources:
    d = feedparser.parse(h)
    for post in d.entries:
        if not_present(old, post.link):
            f.write(post.title + "|" + post.link + "\n")
            k.write(post.title + "|" + post.link + "\n")
f.close()
k.close()

# Remove shows without [Ss][0-9] and put them in moo
m = open(moo, 'a')
t = open(tmp, 'w')
file = open(ttv, "r")
for line in file:
    if re.search(r's[0-9]', line, re.I) is None:
        m.write(line)
        # print("moo", line)
    else:
        t.write(line)
        # print("tmp", line)
t.close()
m.close()
os.rename(tmp, ttv)

# Remove badshows
t = open(tmp, 'w')
with open(bad) as f:
    content = f.readlines()
bap = [x.strip() for x in content]

with open(ttv) as f:
    content = f.readlines()
all = [x.strip() for x in content]

for line in all:
    flag = True
    for b in bap:
        if b in line:
            flag = False
            break
    if flag:
        t.write(line + "\n")
t.close()
os.rename(tmp, ttv)

How to save multiple xml files in python

I'm attempting to get a series of weather reports from a website. I have the below code, which creates the needed URLs for the XMLs I want. What would be the best way to save the returned XMLs with different names?
with open('file.csv') as csvfile:
    towns_csv = csv.reader(csvfile, dialect='excel')
    for rows in towns_csv:
        x = float(rows[2])
        y = float(rows[1])
        url = ("http://api.met.no/weatherapi/locationforecast/1.9/?")
        lat = "lat="+format(y)
        lon = "lon="+format(x)
        text = url + format(lat) + ";" + format(lon)
I have been saving single XMLs with this code:
response = requests.get(text)
xml_text=response.text
winds= bs4.BeautifulSoup(xml_text, "xml")
f = open('test.xml', "w")
f.write(winds.prettify())
f.close()
The first column of the CSV file has city names in it; I would ideally like to use those names to save each XML file as it is created. I'm sure another for loop would do; I'm just not sure how to write it.
Any help would be great, thanks again Stack.
You have done most of the work already. Just use rows[0] as your filename. Assuming rows[0] is 'mumbai', then rows[0]+'.xml' will give you 'mumbai.xml' as the filename. You might want to check if city names have spaces which need to be removed, etc.
with open('file.csv') as csvfile:
    towns_csv = csv.reader(csvfile, dialect='excel')
    for rows in towns_csv:
        x = float(rows[2])
        y = float(rows[1])
        url = ("http://api.met.no/weatherapi/locationforecast/1.9/?")
        lat = "lat="+format(y)
        lon = "lon="+format(x)
        text = url + format(lat) + ";" + format(lon)
        response = requests.get(text)
        xml_text = response.text
        winds = bs4.BeautifulSoup(xml_text, "xml")
        f = open(rows[0]+'.xml', "w")
        f.write(winds.prettify())
        f.close()
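If the city names contain spaces or other characters that are awkward in filenames, something like this untested helper (safe_filename is a hypothetical name) could be applied to rows[0] first:
import re

def safe_filename(name):
    # strip whitespace and replace anything that is not a letter, digit,
    # dash or underscore, so the city name is safe to use as a filename
    return re.sub(r'[^A-Za-z0-9_-]+', '_', name.strip())

# e.g. f = open(safe_filename(rows[0]) + '.xml', "w")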

Cleaning up a messy data file to a more readable format in Python?

I have a text file (heavily modified for this example) which has some data that I want to extract and do some calculations with. However, the text file is extremely messy, so I'm trying to clean it up and write it out to new files first.
Here is the .txt file I'm working with: http://textuploader.com/5elql
I am trying to extract the data which is under the titles (called “Important title”). The only reliable way to do that is to first locate the string “DATASET”, which always occurs in the file, because the mess above and below the important data covers an arbitrary number of lines and is difficult to remove manually. Once that's done I want to store the data in separate files so that it is easier to analyse, like this:
http://textuploader.com/5elqw
The file names will be the title concatenated with the date.
Here is what I have tried so far
with open("example.txt") as file:
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:]
if line.startswith("DATE:"):
fileDate = line[:]
print(fileTitle+fileDate)
OUTPUT
IMPORTANT TITLE 1
DATE: 12/30/2015
IMPORTANT TITLE 2
DATE: 01/03/2016
So it appears my loop manages to locate the lines where the titles are inside the file and print them out. But this is where I run out of steam. I have no idea how to extract the data under those titles from there onwards. I have tried using file.readlines(), but it outputs all the mess that is between Important Title 1 and Important Title 2.
Any advice on how I can read all the data under the titles and output them into separate files? Thanks for your time.
You could use regex.
import re

pattern = r"(\s+X\s+Y\s*)|(\s*\d+\s+\d+\s*)"
prog = re.compile(pattern)

with open("example.txt") as file:
    cur_filename = ''
    content = ""
    for line in file:
        if line.startswith('DATASET:'):
            fileTitle = line[9:]
        elif line.startswith("DATE:"):
            fileDate = line[6:]
            cur_filename = (fileTitle.strip() + fileDate.strip()).replace('/', '-')
            print(cur_filename)
            content_title = fileTitle + line
        elif prog.match(line):
            content += line
        elif cur_filename and content:
            with open(cur_filename, 'w') as fp:
                fp.write(content_title)
                fp.write(content)
            cur_filename = ''
            content = ''
I don't know exactly how you want to store your data, but assuming you want a dictionary, you could use regex to check if the incoming line matches the pattern; then, because fileTitle carries over from the previous lines, you could use it as the key and append the values. I also added rstrip('\r\n') to remove the newline characters after fileTitle.
import re

#if you don't want to store the X and Y, just use re.compile('\d\s+\d+')
p = re.compile('(\d\s+\d+)|(X\s+Y)')
data = {}
with open("input.txt") as file:
    for line in file:
        if line.startswith('DATASET:'):
            fileTitle = line[9:].rstrip('\r\n')
        if line.startswith("DATE:"):
            fileDate = line[:]
            print(fileTitle+fileDate)
        if p.match(line):
            if fileTitle not in data:
                data[fileTitle] = []
            line = line.rstrip('\r\n')
            data[fileTitle].append(line.split('\t'))
            if len(data[fileTitle][len(data[fileTitle])-1]) == 3:
                data[fileTitle][len(data[fileTitle])-1].pop()
print(data)
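Since the question also wants the data stored in separate files, here is an untested addition that dumps each title's rows from the data dictionary into its own file (filenames are just the titles, with any slashes replaced):
# Untested sketch: one output file per title, values tab-separated again.
for title, rows in data.items():
    with open(title.replace('/', '-') + '.txt', 'w') as out:
        for row in rows:
            out.write('\t'.join(row) + '\n')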
Yet another regex solution:
import re

sep = '*************************\n'

pattern = r'DATASET[^%]*'
good_stuff = re.compile(pattern)
pattern = r'^DATASET: (.*?)$'
title = re.compile(pattern, flags=re.MULTILINE)
pattern = r'^DATE: (.*?)$'
date = re.compile(pattern, flags=re.MULTILINE)

with open(r'foo.txt') as f:
    data = f.read()

for match in good_stuff.finditer(data):
    data = match.group()
    important_title = title.search(data).group(1)
    important_date = date.search(data).group(1)
    important_date = important_date.replace(r'/', '-')
    fname = important_title + important_date + '.txt'
    print(sep, fname)
    print(data)
    ##with open(fname, 'w') as f:
    ##    f.write(data)
