read until end of file after a matching string - python

I am trying to read lines after a match from a file:
with open(jij, "a") as jout:
    with open(jfile, "r") as jinp:
        for line in jinp:
            if line.strip().startswith("IQ"):
                # for _ in line:
                # for lines in jinp:
                for lines in range(2500):
                    # lines = jinp.readline()
                    rows = jinp.readline().split()
                    print("{0:<3s}{1:<3s}{2:<3s}{3:<3s}{4:>3s}{5:>3s}{6:>3s}{7:>15s}{8:>7s}".
                          format(rows[3], rows[2], rows[0], rows[1], rows[4], rows[5], rows[6], rows[11], rows[10]))
A very short jfile looks like this (I generally have around 1000 lines, but it may be even bigger):
Isotropic exchange couplings Jij
number of sites NQ = 2
number of types NT = 2
site occupation:
1 1 1 1.000
2 1 2 1.000
IQ IT JQ JT N1 N2 N3 DRX DRY DRZ DR J_ij [mRy] J_ij [meV]
1 1 2 2 -1 -1 -1 -0.500 -0.500 -0.681 0.982 0.159317355 2.167623834
1 1 2 2 0 -1 -1 0.500 -0.500 -0.681 0.982 0.159317355 2.167623834
1 1 2 2 -1 0 -1 -0.500 0.500 -0.681 0.982 0.159317355 2.167623834
1 1 2 2 0 0 -1 0.500 0.500 -0.681 0.982 0.159317355 2.167623834
1 1 2 2 -1 -1 0 -0.500 -0.500 0.681 0.982 0.159317355 2.167623834
1 1 2 2 0 -1 0 0.500 -0.500 0.681 0.982 0.159317355 2.167623834
1 1 2 2 -1 0 0 -0.500 0.500 0.681 0.982 0.159317355 2.167623834
1 1 2 2 0 0 0 0.500 0.500 0.681 0.982 0.159317355 2.167623834
1 1 1 1 0 -1 0 0.000 -1.000 0.000 1.000 1.457569899 19.831256008
1 1 1 1 -1 0 0 -1.000 0.000 0.000 1.000 1.453728096 19.778985590
I am trying to print a few elements as a list after it finds "IQ".
My preferred way is to do it with for _ in line, but that only takes the first 100 lines; for lines in jinp skips one line and reads the next one. It only works as intended when I put it in range, but I don't want to use a fixed line count.
What is going wrong with for _ in line?
https://da.gd/CtKZ is the complete file.
https://da.gd/7V8F result with for lines in range(2500)
https://da.gd/6cx3 result with for _ in line
https://da.gd/v9ts result with for lines in jinp
The expected result is the one from range(2500), but I don't want to hardcode the line numbers.

Your problem is that you reuse the same file handle:
rows = jinp.readline().split()  # this advances the file pointer to the next line
All of your variants have this line plus another form of iteration:
# for _ in line: iterates over the characters of the matched line (~100 of them)
# for lines in jinp: iterates over the open file, so you read two lines per iteration
You could use this instead; it is shorter and more readable:
flag = False
with open(jij, "a") as jout:
    with open(jfile, "r") as jinp:
        for line in jinp:
            if flag:
                rows = line.split()
                jout.write("{0:<3s}{1:<3s}{2:<3s}{3:<3s}{4:>3s}{5:>3s}{6:>3s}{7:>15s}{8:>7s}\n".
                           format(rows[3], rows[2], rows[0], rows[1], rows[4], rows[5], rows[6],
                                  rows[11], rows[10]))
            else:
                flag = line.strip().startswith("IQ")
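Alternatively, itertools.dropwhile can skip everything up to the matching header in one step; a sketch with the same jfile/jij names as above:

import itertools

with open(jfile, "r") as jinp, open(jij, "a") as jout:
    # drop lines until the "IQ ..." header, then consume the header itself
    body = itertools.dropwhile(lambda l: not l.strip().startswith("IQ"), jinp)
    next(body, None)
    for line in body:
        rows = line.split()
        jout.write("{0:<3s}{1:<3s}{2:<3s}{3:<3s}{4:>3s}{5:>3s}{6:>3s}{7:>15s}{8:>7s}\n".
                   format(rows[3], rows[2], rows[0], rows[1], rows[4], rows[5], rows[6],
                          rows[11], rows[10]))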

Load files with two indexes in one dataframe

How can I load 28 files that all have the same number of rows and columns, so that the index does not run through all the files' data (0-2911) but restarts for each file (0-103), with a second index (1-28) that increments for each new file?
Here is the code that I wrote that iterates through all data:
import pandas as pd
import glob

path = r"C:/Users/Measurment_Data/Test_1"
all_files = glob.glob(path + "/*.dat")
li = []
for filename in all_files:
    df = pd.read_csv(filename, sep="\t", names=["Voltage", "Current"], header=None)
    li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True)
frame
Output:
ID Voltage Current
0 NaN 1.000000e+00
1 0.00 -3.047149e-06
2 0.04 -4.941096e-06
3 0.08 -4.472754e-06
4 0.12 -1.053477e-05
... ... ...
2907 -0.16 1.194359e-06
2908 -0.12 5.489425e-06
2909 -0.08 -9.656614e-09
2910 -0.04 -3.427169e-06
2911 -0.00 -2.173696e-06
I would like to have new indexes for every new loaded file. Something like this:
File ID Curr Volt
1 0 0.00 1.00E+00
1 1 0.00 -3.05E-06
1 2 0.04 -4.94E-06
...
1 102 0.08 -4.47E-06
1 103 0.12 -1.05E-05
...
2 0 0.00 2.00E+00
2 1 4.00 -3.05E-06
2 2 0.44 -3.94E-06
...
2 102 5.08 -6.47E-06
2 103 0.22 -6.05E-05
...
...
27 0 0.00 2.00E+00
27 1 4.00 -3.05E-06
27 2 0.44 -3.94E-06
...
27 102 5.08 -6.47E-06
27 103 0.22 -6.05E-05
...
28 0 0.00 2.00E+00
28 1 4.00 -3.05E-06
28 2 0.44 -3.94E-06
...
28 102 5.08 -6.47E-06
28 103 0.22 -6.05E-05
I would like to easily access the values of every file by index, for example all values 0-5 from each of the 28 files.
Just define a new column after you read each file, then concatenate using the default value of ignore_index:
import pandas as pd
import glob

path = r"C:/Users/Measurment_Data/Test_1"
all_files = glob.glob(path + "/*.dat")
li = []
j = 1
for filename in all_files:
    df = pd.read_csv(filename, sep="\t", names=["Voltage", "Current"], header=None)
    df.insert(0, 'File', '')
    df["File"] = j
    j += 1
    li.append(df)
frame = pd.concat(li, axis=0)
frame
Give it a try!
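If you prefer a real two-level index instead of a File column, pd.concat can build one directly through its keys argument; a sketch with the same path and column names as above:

import pandas as pd
import glob

path = r"C:/Users/Measurment_Data/Test_1"
all_files = glob.glob(path + "/*.dat")
li = [pd.read_csv(f, sep="\t", names=["Voltage", "Current"], header=None)
      for f in all_files]
# keys become the outer "File" level; each file keeps its own 0..n index
frame = pd.concat(li, keys=range(1, len(li) + 1), names=["File", "ID"])
# e.g. rows 0-5 of every file via the second index level:
frame.loc[(slice(None), slice(0, 5)), :]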

AttributeError: 'NoneType' object has no attribute 'text' - BeautifulSoup

I have a little code for scraping info from fbref (link to the data: https://fbref.com/en/comps/9/stats/Premier-League-Stats) and it worked well, but now I have problems with some features (I've checked that the fields which don't work now are "player", "nationality", "position", "squad", "age", "birth_year"). I have also checked that the fields have the same names on the web as they used to. Any ideas/help to solve the problem?
Many Thanks!
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import sys, getopt
import csv

def get_tables(url):
    res = requests.get(url)
    ## The next two lines get around the issue with comments breaking the parsing.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')
    all_tables = soup.findAll("tbody")
    team_table = all_tables[0]
    player_table = all_tables[1]
    return player_table, team_table

def get_frame(features, player_table):
    pre_df_player = dict()
    features_wanted_player = features
    rows_player = player_table.find_all('tr')
    for row in rows_player:
        if(row.find('th', {"scope": "row"}) != None):
            for f in features_wanted_player:
                cell = row.find("td", {"data-stat": f})
                a = cell.text.strip().encode()
                text = a.decode("utf-8")
                if(text == ''):
                    text = '0'
                if((f!='player')&(f!='nationality')&(f!='position')&(f!='squad')&(f!='age')&(f!='birth_year')):
                    text = float(text.replace(',', ''))
                if f in pre_df_player:
                    pre_df_player[f].append(text)
                else:
                    pre_df_player[f] = [text]
    df_player = pd.DataFrame.from_dict(pre_df_player)
    return df_player

stats = ["player","nationality","position","squad","age","birth_year","games","games_starts","minutes","goals","assists","pens_made","pens_att","cards_yellow","cards_red","goals_per90","assists_per90","goals_assists_per90","goals_pens_per90","goals_assists_pens_per90","xg","npxg","xa","xg_per90","xa_per90","xg_xa_per90","npxg_per90","npxg_xa_per90"]

def frame_for_category(category, top, end, features):
    url = (top + category + end)
    player_table, team_table = get_tables(url)
    df_player = get_frame(features, player_table)
    return df_player

top = 'https://fbref.com/en/comps/9/'
end = '/Premier-League-Stats'
df1 = frame_for_category('stats', top, end, stats)
df1
I suggest loading the table with pandas' read_html. There is a direct link to this table under Share & Export --> Embed this Table.
import pandas as pd
df = pd.read_html("https://widgets.sports-reference.com/wg.fcgi?css=1&site=fb&url=%2Fen%2Fcomps%2F9%2Fstats%2FPremier-League-Stats&div=div_stats_standard", header=1)
This outputs a list of dataframes; the table can be accessed as df[0]. Output of df[0].head():
  Rk  Player               Nation   Pos  Squad           Age     Born  MP  Starts  Min   90s   Gls  Ast  G-PK  PK  PKatt  CrdY  CrdR  Gls.1  Ast.1  G+A   G-PK.1  G+A-PK  xG   npxG  xA   npxG+xA  xG.1  xA.1  xG+xA  npxG.1  npxG+xA.1  Matches
0  1  Patrick van Aanholt  nl NED   DF   Crystal Palace  30-190  1990  16  15      1324  14.7  0    1    0     0   0      1     0     0      0.07   0.07  0       0.07    1.2  1.2   0.8  2        0.08  0.05  0.13   0.08    0.13       Matches
1  2  Tammy Abraham        eng ENG  FW   Chelsea         23-156  1997  20  12      1021  11.3  6    1    6     0   0      0     0     0.53   0.09   0.62  0.53    0.62    5.6  5.6   0.9  6.5      0.49  0.08  0.57   0.49    0.57       Matches
2  3  Che Adams            eng ENG  FW   Southampton     24-237  1996  26  22      1985  22.1  5    4    5     0   0      1     0     0.23   0.18   0.41  0.23    0.41    5.5  5.5   4.3  9.9      0.25  0.2   0.45   0.25    0.45       Matches
3  4  Tosin Adarabioyo     eng ENG  DF   Fulham          23-164  1997  23  23      2070  23    0    0    0     0   0      1     0     0      0      0     0       0       1    1     0.1  1.1      0.04  0.01  0.05   0.04    0.05       Matches
4  5  Adrián               es ESP   GK   Liverpool       34-063  1987  3   3       270   3     0    0    0     0   0      0     0     0      0      0     0       0       0    0     0    0        0     0     0      0       0          Matches
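Note that fbref repeats the header row partway down the table, so rows where Rk is literally "Rk" may show up in df[0]. A small cleanup sketch (the column names are taken from the head() output above; the numeric-column slice is an assumption you may need to adjust):

import pandas as pd

url = "https://widgets.sports-reference.com/wg.fcgi?css=1&site=fb&url=%2Fen%2Fcomps%2F9%2Fstats%2FPremier-League-Stats&div=div_stats_standard"
df = pd.read_html(url, header=1)[0]
# drop repeated header rows and the link-only "Matches" column
df = df[df["Rk"] != "Rk"].drop(columns="Matches")
# convert everything from "MP" onwards to numbers (assumed numeric)
numeric_cols = df.columns[7:]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")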
If you're only after the player stats, change player_table = all_tables[1] to player_table = all_tables[2], because at the moment you are feeding the team table into the get_frame function.
I tried it and it worked fine after that.
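For reference, a minimal sketch of that one-line change inside get_tables (everything else stays the same):

def get_tables(url):
    res = requests.get(url)
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')
    all_tables = soup.findAll("tbody")
    team_table = all_tables[0]
    player_table = all_tables[2]  # was all_tables[1], which now holds a team table
    return player_table, team_table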

Using pandas dataframes, how to read through a column to find "True" statements and then create a new dataframe

Below I have 4 columns in my dataframe. I am interested in going through the entire "Greater_than_50" column. Upon reaching a "True" flag, I then want to take the associated "Discharge" and "Resistance" values to make a new dataframe which contains only those values found to be "True".
time Discharge Resistance Greater_than_50
-------------------------------------------------------------
0 0.000 NaN NaN
1 0.005 76.373 True
2 0.010 -48.174 False
3 0.016 -37.012 False
4 0.021 -27.808 False
5 0.026 -24.674 False
6 0.031 -20.464 False
7 0.037 100.114 True
... ... ... ...
I would like the new dataframe to look something like this:
Discharge Resistance
------------------------------
0.005 76.373
0.037 100.114
... ...
df['Greater_than_50'] = [val.strip() for val in df['Greater_than_50'].astype(str)]
# columns to keep
col_mask = ['Discharge', 'Resistance']
df_new = df.loc[df['Greater_than_50'] == 'True'][col_mask]
This is how I tested it:
'''
time Discharge Resistance Greater_than_50
0 0.000 NaN NaN
1 0.005 76.373 True
2 0.010 -48.174 False
3 0.016 -37.012 False
4 0.021 -27.808 False
5 0.026 -24.674 False
6 0.031 -20.464 False
7 0.037 100.114 True
'''
import pandas as pd
df = pd.read_clipboard()
print(df)
Original df:
time Discharge Resistance Greater_than_50
0 0 0.000 NaN NaN
1 1 0.005 76.373 True
2 2 0.010 -48.174 False
3 3 0.016 -37.012 False
4 4 0.021 -27.808 False
5 5 0.026 -24.674 False
6 6 0.031 -20.464 False
7 7 0.037 100.114 True
df['Greater_than_50'] = [val.strip() for val in df['Greater_than_50'].astype(str)]
# columns to keep
col_mask = ['Discharge', 'Resistance']
df_new = df.loc[df['Greater_than_50'] == 'True'][col_mask]
print(df_new)
Output:
Discharge Resistance
1 0.005 76.373
7 0.037 100.114
Just replace whatever columns you want to keep in the 'col_mask'.
Assuming the positions of the columns "Discharge" and "Resistance" are 1 and 2, df2 is what you need:
df1 = df[df.Greater_than_50 == True]
df2 = df1.iloc[:, 1:3]
You can do it as a one-liner like so:
df2 = df[df.Greater_than_50 == True].iloc[:, 1:3]
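If you also want the result reindexed from 0 like the expected output in the question, the mask and the column selection can go in a single .loc call; a sketch assuming the same boolean comparison as above:

df_new = df.loc[df["Greater_than_50"] == True, ["Discharge", "Resistance"]].reset_index(drop=True)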

optimizing the for loop for faster performance

I have a dataframe that contains the similarity scores (100x100) for 100 products against 100 products (data_neighbours). I have another dataframe that has the data at the user and product level (1000x100). I want to go through each product for each user, get the top-10 similar products from data_neighbours and their corresponding similarity scores, and compute the function getScore below:
def getScore(history, similarities):
    return sum(history*similarities)/sum(similarities)

for i in range(0, len(data_sims.index)):
    for j in range(1, len(data_sims.columns)):
        user = data_sims.index[i]
        product = data_sims.columns[j]
        if data.ix[i][j] == 1:
            data_sims.ix[i][j] = 0
        else:
            product_top_names = data_neighbours.ix[product][1:10]
            product_top_sims = data_ibs.ix[product].order(ascending=False)[1:10]
            user_purchases = data_germany.ix[user, product_top_names]
            data_sims.ix[i][j] = getScore(user_purchases, product_top_sims)
How can I optimize this loop for faster processing? The example is taken from here: http://www.salemmarafi.com/code/collaborative-filtering-with-python/
Sample data:
Data (1000x101); user is the 101st column:
Index user song1 song2.....
0 1 0 0
1 33 0 1
2 42 1 0
3 51 0 0
data_ibs(similarity scores)--(100x100):
song1 song2 song3 song4
song1 1.00 0.00 0.02 0.05
song2 0.00 1.00 0.05 0.03
song3 0.02 0.05 1.00 0.11
song4 0.05 0.03 0.11 1.00
data_neighbours(top10 similar songs for each song based on sorted score from data_ibs)--(100x10):
1 2 3......... 10
song1 song5 song10 song4
song2 song8 song11 song5
song3 song9 song12 song10
data_germany (user-level data with a column for each song, except userid)--(1000x100):
index song1 song2 song3
1 0 0 0
2 1 0 0
3 0 0 1
Expected dataset(data_sims)--1000x101:
user song1 song2 song3
1 0.00 0.00 0.22
33 0.09 0.00 0.11
42 0.00 0.10 0.00
51 0.09 0.09 0.00
If the value in data is 1 for a song, its score is simply set to 0. Otherwise, the top-10 similar songs are fetched from data_neighbours and their corresponding scores from data_ibs. It is then checked whether those songs are already present for the user (1/0) in the user_purchases dataset. Finally, the similarity score for position i x j is computed by multiplying user_purchases (the 1/0 values for each top-10 song) by the similarity scores from data_ibs and dividing by the sum of the top-10 similarity scores. The same is repeated for every user x song combination.
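For reference, the inner per-user loop collapses to one matrix product per song column. A vectorized sketch, assuming data_germany is the 0/1 purchase matrix (users x songs), that the indexes of the frames are aligned, and keeping the original [1:10] neighbour slice (names follow the sample data above):

import pandas as pd

def build_sims(data_germany, data_ibs, data_neighbours):
    # one row per user, one column per song
    scores = pd.DataFrame(0.0, index=data_germany.index, columns=data_germany.columns)
    for song in data_germany.columns:
        # top similar songs and their scores, as in the original [1:10] slices
        top_names = list(data_neighbours.loc[song][1:10])
        top_sims = data_ibs.loc[song, top_names]
        # (users x 9) dot (9,) replaces the loop over users
        scores[song] = data_germany[top_names].values.dot(top_sims.values) / top_sims.sum()
    # zero out songs the user already has, as in the data.ix[i][j] == 1 branch
    return scores.mask(data_germany == 1, 0.0)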

Python: Read and write a file of complex and repeating format

To begin with, sorry for my poor English.
I have a file with a repeating format, such as:
326 Iteration: 0 #Bonds: 10
1 6 7 14 54 70 77 0 0 0 0 0 1 0.693 0.632 0.847 0.750 0.644 0.000 0.000 0.000 0.000 0.000 3.566 0.000 0.028
2 6 3 6 15 55 0 0 0 0 0 0 1 0.925 0.920 0.909 0.892 0.000 0.000 0.000 0.000 0.000 0.000 3.645 0.000 -0.040
3 6 2 8 10 52 0 0 0 0 0 0 1 0.925 0.910 0.920 0.898 0.000 0.000 0.000 0.000 0.000 0.000 3.653 0.000 0.000
...
324 8 323 0 0 0 0 0 0 0 0 0 100 0.871 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.871 3.000 -0.493
325 2 326 0 0 0 0 0 0 0 0 0 101 0.930 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.930 0.000 0.334
326 8 325 0 0 0 0 0 0 0 0 0 101 0.930 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.930 3.000 -0.611
637.916060425841 306.094529423257 1250.10511927236
6.782126993565285E-006
326 (repeating from here) Iteration: 100 #Bonds: 10
1 6 7 14 54 64 70 77 0 0 0 0 1 0.885 0.580 0.819 0.335 0.784 0.709 0.000 0.000 0.000 0.000 4.111 0.000 0.025
2 6 3 6 15 55 0 0 0 0 0 0 1 0.812 0.992 0.869 0.966 0.000 0.000 0.000 0.000 0.000 0.000 3.639 0.000 -0.034
3 6 2 8 10 52 0 0 0 0 0 0 1 0.812 0.966 0.989 0.926 0.000 0.000 0.000 0.000 0.000 0.000 3.692 0.000 0.004
As you can see here, the first line is the header, lines 2~327 are the data that I want to analyze, and lines 328 and 329 hold some numbers which I don't want to use. The next "frame" starts from line 330 with exactly the same format. This "frame" repeats more than 200,000 times.
I want to use the 1st~13th columns of the data in lines 2~327 of each frame. I also want to use the first number of the header.
For the 3rd~12th columns of lines 2~327 of every repeating "frame", I want to print the number of zeros and the number of non-zeros in that target matrix, and also print the 1st, 2nd and 13th columns. So the expected output file becomes:
326
1
1 6 5 5 1
2 6 4 6 1
...
325 2 1 9 101
326 8 1 9 101
326 (Next frame starts from here)
2
1 6 5 5 1
2 6 4 6 1
...
326
3
1 6 5 5 1
2 6 4 6 1
...
First line: the first number of the input's first line.
Second line: the frame number.
Lines 3~328: the 1st column of the input, the 2nd column of the input, the number of non-zeros among the 3rd~12th columns, the number of zeros among the 3rd~12th columns, and the 13th column of the input.
From there on: the same repeating format as above.
So the result file has 2 header lines plus 326 lines of analyzed data, 328 lines in total per frame, and the same format repeats for each following frame. Keeping that fixed-width format (5 characters per field) is recommended so the file can be used for other purposes.
My current approach is to create 13 arrays for the 13 columns and store the data with a double for loop over each frame and each of its 328 lines. But I have no idea how to deal with the output.
The following is my trial code (unfinished, it only reads the input), and it has a lot of problems. linecache reads the whole line, not just the first number of each frame's first line. Every frame has 326+3=329 lines, but my code does not seem to handle the frames properly. I welcome any help with analyzing this data. Thank you very much in advance.
# Read the file
filename = raw_input("Enter the file name \n")
file = open(filename, 'r')
# Read the number of atoms from the header
import linecache
nnn = linecache.getline(filename, 1)
natoms = int(nnn)
singleframe = natoms + 3
# Get the number of frames
nlines = 0
for i1 in file:
    nlines = nlines + 1
file.close()
nframes = nlines / singleframe
print 'no of lines are: ', nlines
print 'no of frames are: ', nframes
print 'no of atoms are:', natoms
# Create 1d string array
nrange = range(nlines)
data_lines = [None]*(nlines)
# Store the whole input file in the string array
file = open(filename, 'r')
i1 = 0
for i1 in nrange:
    data_lines[i1] = file.readline()
file.close()
# Create 1d arrays to store the atomic data
at_index = [None]*natoms
at_type = [None]*natoms
n1 = [None]*natoms
n2 = [None]*natoms
n3 = [None]*natoms
n4 = [None]*natoms
n5 = [None]*natoms
n6 = [None]*natoms
n7 = [None]*natoms
n8 = [None]*natoms
n9 = [None]*natoms
n10 = [None]*natoms
molnr = [None]*natoms
nrange1 = range(natoms)
nframe = range(nframes)
file = open('output_force', 'w')
print data_lines[9]
for j1 in nframe:
    start = j1*(natoms + 3) + 3
    for i1 in nrange1:
        line = data_lines[i1+start].split()  # split each line on whitespace
        at_index[i1] = int(line[0])
        at_type[i1] = int(line[1])
        n1[i1] = int(line[2])
        n2[i1] = int(line[3])
        n3[i1] = int(line[4])
        n4[i1] = int(line[5])
        n5[i1] = int(line[6])
        n6[i1] = int(line[7])
        n7[i1] = int(line[8])
        n8[i1] = int(line[9])
        n9[i1] = int(line[10])
        n10[i1] = int(line[11])
        molnr[i1] = int(line[12])
When you are working with csv files, you should look into the csv module. I wrote some code that should do the trick.
This code assumes "good data". If your data set may contain errors (such as fewer than 13 columns, or fewer than 326 data rows), some alterations should be made.
(Changed to comply with Python 2.6.6.)
import csv

with open('mydata.csv') as in_file:
    with open('outfile.csv', 'wb') as out_file:
        csv_reader = csv.reader(in_file, delimiter=' ', skipinitialspace=True)
        csv_writer = csv.writer(out_file, delimiter='\t')
        # Iterate over all rows in the file
        for i, header in enumerate(csv_reader):
            # Get the header data
            num = header[0]
            csv_writer.writerow([num])
            # Write the frame number, starting with 1 (hence the +1 part)
            csv_writer.writerow([i + 1])
            # Iterate over all data rows
            for _ in xrange(326):
                # Call next(csv_reader) to get the next row.
                # Put inside a try ... except to avoid a StopIteration exception
                # if the end of file is found before reaching 326 lines.
                try:
                    row = next(csv_reader)
                except StopIteration:
                    break
                # Use a list comprehension to count the zeros
                zeros = sum([1 for x in row[2:12] if x.strip() == '0'])
                not_zeros = 10 - zeros
                # Write the data to the output file
                out = [row[0].strip(), row[1].strip(), not_zeros, zeros, row[12].strip()]
                csv_writer.writerow(out)
            # If the inner loop finished without a break (no early end of file),
            # skip the two extra lines between frames
            else:
                next(csv_reader)
                next(csv_reader)
For the header and the first three data rows, this yields:
326
1
1 6 5 5 1
2 6 4 6 1
3 6 4 6 1
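If the fixed-width layout (5 characters per field) mentioned in the question matters for downstream tools, plain string formatting can replace the csv writer for the data rows; a sketch reusing the row, zeros and not_zeros names from the code above:

out_file.write("{0:>5s}{1:>5s}{2:>5d}{3:>5d}{4:>5s}\n".format(
    row[0].strip(), row[1].strip(), not_zeros, zeros, row[12].strip()))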
