How to extract numbers from filename in Python? - python

I need to extract just the numbers from file names such as:
GapPoints1.shp
GapPoints23.shp
GapPoints109.shp
How can I extract just the numbers from these files using Python? I'll need to incorporate this into a for loop.

you can use regular expressions:
regex = re.compile(r'\d+')
Then to get the strings that match:
regex.findall(filename)
This will return a list of strings which contain the numbers. If you actually want integers, you could use int:
[int(x) for x in regex.findall(filename)]
If there's only 1 number in each filename, you could use regex.search(filename).group(0) (if you're certain that it will produce a match). If no match is found, the above line will raise an AttributeError saying that 'NoneType' object has no attribute 'group'.

So, you haven't left any description of where these files are and how you're getting them, but I assume you'd get the filenames using the os module.
As for getting the numbers out of the names, you'd be best off using regular expressions with re, something like this:
import re
def get_numbers_from_filename(filename):
    """Return the first run of digits found in *filename*, as a string.

    Returns ``None`` when the name contains no digits, so that looping over
    a directory that also holds unnumbered files does not crash.
    """
    match = re.search(r'\d+', filename)
    return match.group(0) if match else None
Then, to include that in a for loop, you'd run that function on each filename:
for filename in os.listdir(myfiledirectory):
    # print() with a single argument behaves the same on Python 2 and 3.
    print(get_numbers_from_filename(filename))
or something along those lines.

If there is just one number:
# Join the kept characters explicitly: on Python 3 filter() yields an
# iterator rather than a string.
''.join(ch for ch in filename if ch.isdigit())

Here is the code I used to move the publication year of a paper to the front of the filename after the file is downloaded from Google Scholar.
The files are usually named Author+PublishedYear.pdf; after running this code the filename becomes PublishedYear+Author.pdf.
# Renaming Pdf according to number extraction
# You want to rename a pdf file, so the digits of document published year comes first.
# Use regular expressions
# As long as you implement this file, the other pattern will be accomplished to your filename.
# import libraries
import re
import os
# Change working directory to this folder
address = os.getcwd ()
os.chdir(address)
# defining a class with two function
class file_name:
    """Split a PDF filename into its digit part and its non-digit part.

    Two filename layouts are handled, one helper per layout:
    ``schrodinger1990.pdf`` (non-digits first) and ``1993schrodinger.pdf``
    (digits first). The helpers are static so they can be called either on
    the class or on an instance.
    """

    def __init__(self, filename):
        self.filename = filename

    @staticmethod
    def number_extrction_pattern_non_digits_first(filename):
        """Return ``(digits, non_digits)`` for a name like ``schrodinger1990.pdf``.

        Raises AttributeError when *filename* does not contain that layout.
        """
        pattern = r'(\D+)(\d+)(\.pdf)'
        # Search once and reuse the match object for both groups.
        match = re.search(pattern, filename, re.IGNORECASE)
        return match.group(2), match.group(1)

    @staticmethod
    def number_extrction_pattern_digits_first(filename):
        """Return ``(digits, non_digits)`` for a name like ``1993schrodinger.pdf``.

        Raises AttributeError when *filename* does not contain that layout.
        """
        pattern = r'(\d+)(\D+)(\.pdf)'
        match = re.search(pattern, filename, re.IGNORECASE)
        return match.group(1), match.group(2)
if __name__ == '__main__':
    # Layout "author + year", e.g. schrodinger1990.pdf.
    pattern_check1 = r'(\D+)(\d+)(\.pdf)'
    # Layout "year + author", e.g. 1993schrodinger.pdf.
    pattern_check2 = r'(\d+)(\D+)(\.pdf)'
    # Visit every PDF in the working directory.
    for filename in os.listdir(address):
        if not filename.endswith('.pdf'):
            continue
        if re.search(pattern_check1, filename, re.IGNORECASE):
            digits, non_digits = file_name.number_extrction_pattern_non_digits_first(filename)
        elif re.search(pattern_check2, filename, re.IGNORECASE):
            digits, non_digits = file_name.number_extrction_pattern_digits_first(filename)
        else:
            # Neither layout (for example a PDF with no digits in its name):
            # leave the file alone instead of crashing on a failed match.
            continue
        os.rename(filename, digits + non_digits + '.pdf')

Related

How to give a function access to a variable in another functions's inner for loop?

The program renames the files from American MM-DD-YYYY date format to European DD-MM-YYYY date format. I need somehow to pass the value of fileName in search_files function to the rename_file function so I can change the name of the file. Any idea how can I do that?
I think it may be possible to associate every fileName with it's new formatted name and to pass them as a dictionary. I didn't try this yet, but is there an easier way to do that?
def rename_file(europeanName, fileName):
    """Rename *fileName* to *europeanName* inside the current directory.

    *fileName* is passed in explicitly: the original body read it as an
    undefined local, so every call failed before any file was moved.
    """
    # Get the full, absolute file paths.
    currentPath = os.path.abspath('.')
    sourcePath = os.path.join(currentPath, fileName)
    targetPath = os.path.join(currentPath, europeanName)
    # Rename the files.
    shutil.move(sourcePath, targetPath)


def form_new_date(beforePart, monthPart, dayPart, yearPart, afterPart, fileName):
    """Build the DD-MM-YYYY name from the date parts and rename *fileName* to it."""
    # Form the European-style filename.
    europeanName = beforePart + dayPart + '-' + monthPart + '-' + yearPart + afterPart
    rename_file(europeanName, fileName)


def breakdown_old_date(matches):
    """Rename every matched file.

    *matches* is a list of ``(match_object, file_name)`` pairs as collected
    by ``search_files``.
    """
    for match, fileName in matches:
        # Get the different parts of the filename.
        beforePart = match.group(1)
        monthPart = match.group(2)
        dayPart = match.group(4)
        yearPart = match.group(6)
        afterPart = match.group(8)
        # Called inside the loop so that each matched file is renamed, not
        # only the last one.
        form_new_date(beforePart, monthPart, dayPart, yearPart, afterPart, fileName)


def search_files(dataPattern):
    """Collect the files in the working directory whose names match *dataPattern*."""
    matches = []
    # Loop over the files in the working directory.
    for fileName in os.listdir('.'):
        matchObj = dataPattern.search(fileName)
        # Skip files without a date; keep the name next to its match so the
        # rename step knows which file to move.
        if matchObj:
            matches.append((matchObj, fileName))
    breakdown_old_date(matches)


def form_regex():
    """Compile the American-date pattern and start the search/rename pass."""
    # Create a regex that can identify the text pattern of American-style dates.
    dataPattern = re.compile(r"""
        ^(.*?) # all text before the date
        ((0|1)?\d)- # one or two digits for the month
        ((0|1|2|3)?\d)- # one or two digits for the day
        ((19|20)\d\d) # four digits for the year
        (.*?)$ # all text after the date
        """, re.VERBOSE)
    search_files(dataPattern)


if __name__ == "__main__":
    form_regex()
Make matches a list of tuples, and for each file that matches:
matches.append((matchObj, fileName))
Then extract it out in breakdown_old_date using
fileName = match[1]
(don't forget to change your match.group calls to match[0].group), and pass it as a parameter to form_new_date, then as a parameter to rename_file.
Also, move the call to form_new_date (in breakdown_old_date) into the for loop, so it executes for each file you want to move.
(Alternatively, instead of making matches a list of tuples, you could make it a dictionary.)

Extract and modify substring from file path

I have a file path saved as filepath in the form of /home/user/filename. Some examples of what the filename could be:
'1990MAlogfile'
'Tantrologfile'
'2003RF_2004logfile'
I need to write something that turns the filepath into just part of the filename (but I do not have just the filename saved as anything yet). For example:
/home/user/1990MAlogfile becomes '1990 MA', /home/user/Tantrologfile becomes 'Tantro', or /home/user/2003RF_2004logfile becomes '2003 RF'.
So I need everything after the last forward slash and before an underscore if it's present (or before the 'logfile' if it's not), and then I need to insert a space between the last number and first letter if there are numbers present. Then I'd like to save the outcome as objkey. Any idea on how I could do this? I was thinking I could use regex, but don't know how I would handle inserting a space in those certain cases.
Code
def get_filename(filepath):
    """Turn a path such as ``/home/user/2003RF_2004logfile`` into ``'2003 RF'``.

    Takes the last path component, drops a trailing ``logfile`` and anything
    from the first underscore on, then separates leading digits from the
    remainder with a single space. Names without leading digits are returned
    without a leading space (``'Tantro'``).
    """
    import os
    import re

    name = os.path.basename(filepath)
    if name.endswith('logfile'):
        name = name[:-len('logfile')]
    stem = name.split('_')[0]
    # '[0-9]*' always matches, possibly the empty string.
    digits = re.match(r'[0-9]*', stem).group(0)
    rest = stem[len(digits):]
    # Join only the non-empty parts so no stray space is produced.
    return ' '.join(part for part in (digits, rest) if part)
example = '/home/user/2003RF_2004logfile'
objkey = get_filename(example)
Explanation
import regular expression package
import re
example filepath
example = '/home/user/2003RF_2004logfile'
/home/user/2003RF_2004logfile
get the filename and remove everything after the _
temp = example.split('/')[-1].split('_')[0]
2003RF
get the beginning portion (splits if numbers at the beginning)
a = re.findall('^[0-9]*', temp)[0]
2003
get the end portion
b = temp[len(a):]
RF
combine the beginning and end portions
return ' '.join([a, b])
2003 RF
import os, re, string
mystr = 'home/user/2003RF_2004logfile'
def format_str(str):
    """Return the readable key for a ``...logfile`` path, or None.

    The last path component must end in ``logfile``; otherwise ``None`` is
    returned. The text before ``logfile`` is cut at the first underscore and
    a space is inserted between the last digit and the non-digit text that
    follows it (``'2003RF'`` -> ``'2003 RF'``). Names with nothing after the
    digits, or with no digits at all, are returned unchanged.

    NOTE: the parameter name shadows the builtin ``str``; it is kept so that
    existing callers are not broken.
    """
    # Use the argument, not a module-level example string.
    end = os.path.split(str)[-1]
    m1 = re.match(r'(.+)logfile', end)
    if m1 is None:
        return None
    this = m1.group(1).split('_')[0]
    # The second group must start with a non-digit so that an all-digit name
    # such as '2003' is not split into '200 3'.
    m2 = re.match(r'(.*[0-9])([^0-9].*)', this)
    if m2 is None:
        return this
    return " ".join([m2.group(1), m2.group(2)])

How to organize file names that were named as only numbers [duplicate]

Lets say I have three files in a folder: file9.txt, file10.txt and file11.txt and i want to read them in this particular order. Can anyone help me with this?
Right now I am using the code
import glob, os
for infile in glob.glob(os.path.join( '*.txt')):
    # print() with a single argument behaves the same on Python 2 and 3.
    print("Current File Being Processed is: " + infile)
and it reads first file10.txt then file11.txt and then file9.txt.
Can someone help me how to get the right order?
Files on the filesystem are not sorted. You can sort the resulting filenames yourself using the sorted() function:
for infile in sorted(glob.glob('*.txt')):
    print("Current File Being Processed is: " + infile)
Note that the os.path.join call in your code is a no-op; with only one argument it doesn't do anything but return that argument unaltered.
Note that your files will sort in alphabetical ordering, which puts 10 before 9. You can use a custom key function to improve the sorting:
import re
numbers = re.compile(r'(\d+)')


def numericalSort(value):
    """Return a sort key that compares the digit runs in *value* as integers.

    ``'file10.txt'`` becomes ``['file', 10, '.txt']``, so ``file9`` sorts
    before ``file10``.
    """
    parts = numbers.split(value)
    # Splitting on a capturing group puts the digit runs at the odd indices.
    parts[1::2] = [int(digits) for digits in parts[1::2]]
    return parts
for infile in sorted(glob.glob('*.txt'), key=numericalSort):
    print("Current File Being Processed is: " + infile)
The numericalSort function splits out any digits in a filename, turns it into an actual number, and returns the result for sorting:
>>> files = ['file9.txt', 'file10.txt', 'file11.txt', '32foo9.txt', '32foo10.txt']
>>> sorted(files)
['32foo10.txt', '32foo9.txt', 'file10.txt', 'file11.txt', 'file9.txt']
>>> sorted(files, key=numericalSort)
['32foo9.txt', '32foo10.txt', 'file9.txt', 'file10.txt', 'file11.txt']
You can wrap your glob.glob( ... ) expression inside a sorted( ... ) statement and sort the resulting list of files. Example:
for infile in sorted(glob.glob('*.txt')):
You can give sorted a comparison function or, better, use the key= ... argument to give it a custom key that is used for sorting.
Example:
There are the following files:
x/blub01.txt
x/blub02.txt
x/blub10.txt
x/blub03.txt
y/blub05.txt
The following code will produce the following output:
for filename in sorted(glob.glob('[xy]/*.txt')):
    print(filename)
# x/blub01.txt
# x/blub02.txt
# x/blub03.txt
# x/blub10.txt
# y/blub05.txt
Now with key function:
def key_func(x):
    """Return the final component of path *x*, so sorting ignores the directory."""
    return os.path.basename(x)
for filename in sorted(glob.glob('[xy]/*.txt'), key=key_func):
    print(filename)
# x/blub01.txt
# x/blub02.txt
# x/blub03.txt
# y/blub05.txt
# x/blub10.txt
EDIT:
Possibly this key function can sort your files:
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
pat = re.compile(r"(\d+)\D*$")
...
def key_func(x):
    """Return a sort key built from the last group of digits in the file name.

    The digits are right-aligned in a 10-character string so that they
    compare in numeric order. Paths whose file name has no digits are
    returned unchanged.
    """
    mat = pat.search(os.path.split(x)[-1])  # match last group of digits
    if mat is None:
        return x
    return "{:>10}".format(mat.group(1))  # right align to 10 characters.
It sure can be improved, but I think you get the point. Paths without numbers will be left alone, paths with numbers will be converted to a string that is 10 digits wide and contains the number.
You need to change the sort from 'ASCIIBetical' to numeric by isolating the number in the filename. You can do that like so:
import re
def keyFunc(afilename):
    """Return the digits contained in *afilename* as one integer.

    All non-digit characters are removed first, so every digit anywhere in
    the name contributes. Raises ValueError when the name has no digits.
    """
    # Raw string: "\D" in a normal string literal is an invalid escape.
    return int(re.sub(r"\D", "", afilename))
filenames = ["file10.txt", "file11.txt", "file9.txt"]
for x in sorted(filenames, key=keyFunc):
    print(x)
Where you can set filenames with the result of glob.glob("*.txt");
Additionally, the keyFunc function assumes the filename will have a number in it, and that the number is only in the filename. You can change that function to be as complex as you need to isolate the number you need to sort on.
glob.glob(os.path.join( '*.txt'))
returns a list of strings, so you can easily sort the list using pythons sorted() function.
sorted(glob.glob(os.path.join( '*.txt')))
for fname in ['file9.txt','file10.txt','file11.txt']:
    with open(fname) as f: # default open mode is for reading
        for line in f:
            # do something with line
            pass  # placeholder: a loop body cannot consist of a comment only

Replace recursively from a replacement map

I have a dictionary in the form
{'from.x': 'from.changed.x',...}
possibly quite big, and I have to substitute in text files accordingly to that dictionary in a quite big directory structure.
I didn't find any nice ready-made solution, so I ended up:
using os.walk
iterating through the dictionary
writing everything out
With something like:
def fix_imports(top_dir, not_ui_keys):
    """Walk through the directory and substitute the wrong imports

    Builds the replacement map from *not_ui_keys* and runs ``replace_text``
    over the contents of every ``.py`` file found under *top_dir*.
    """
    repl = {}
    for n in not_ui_keys:
        # interleave a model in between
        dotted = extract_dotted(n)
        if dotted:
            repl[dotted] = add_model(dotted)

    for root, dirs, files in walk(top_dir):
        py_files = [path.join(root, x) for x in files if x.endswith('.py')]
        for py in py_files:
            # Close each file as soon as it has been read instead of leaving
            # the handle open.
            with open(py) as source:
                # NOTE(review): the substituted text is computed but not
                # written back, as in the original sketch.
                res = replace_text(source.read(), repl)
def replace_text(orig_text, replace_map):
    """Return *orig_text* with every key of *replace_map* replaced by its value.

    The replacements are applied one after another in the map's iteration
    order, and a unified diff from the original to the result is printed.
    """
    from difflib import unified_diff

    res = orig_text
    for to_replace in replace_map:
        # str.replace returns a new string; without the assignment the
        # substitution is silently lost.
        res = res.replace(to_replace, replace_map[to_replace])
    # now print the differences (original first, so '+' lines are the new text)
    for un in unified_diff(orig_text.splitlines(), res.splitlines()):
        print(un)
    return res
Is there any better/nicer/faster way to do it?
EDIT:
Clarifying a bit the problem, the substitution are generated from a function, and they are all in the form:
{'x.y.z': 'x.y.added.z', 'x.b.a': 'x.b.added.a'}
And yes, sure I should better use regexps, I just thought I didn't need them this time.
I don't think it can help much, however, because I can't really formalize the whole range of substitutions with only one (or multiple) regexps..
I would write the first function using generators:
def fix_imports(top_dir, not_ui_keys):
    """Walk through the directory and substitute the wrong imports """
    # Generator expressions replace itertools.imap/ifilter, which exist only
    # on Python 2; falsy extraction results are still dropped.
    dotted_names = (extract_dotted(key) for key in not_ui_keys)
    repl = dict((dotted, add_model(dotted)) for dotted in dotted_names if dotted)
    py_files = (path.join(root, x)
                for root, dirs, files in walk(top_dir)
                for x in files if x[-3:] == '.py')
    for py in py_files:
        with open(py) as opf:
            res = replace_text(opf.read(), repl)
x[-3:]=='.py' is faster than x.endswith('.py')
Thank you everyone, and about the problem of substituting from a mapping in many files, I think I have a working solution:
def replace_map_to_text(repl_map, text_lines):
    """Take a dictionary with the replacements needed and a list of
    lines and return a list with the substituted lines

    Every occurrence of every key is replaced in a single pass per line, so
    replaced text is never scanned again. Keys are matched literally. An
    empty *repl_map* returns the lines unchanged. Changed lines are reported
    through ``logging.getLogger(__name__)``.
    """
    import logging
    import re

    log = logging.getLogger(__name__)
    if not repl_map:
        # An empty alternation would match the empty string everywhere.
        return list(text_lines)

    # Longest keys first, so a key that is a prefix of another cannot win.
    keys = sorted(repl_map, key=len, reverse=True)
    # re.escape quotes every regex metacharacter, not only '.'.
    combined_regexp = re.compile("|".join(re.escape(key) for key in keys))

    res = []
    for line in text_lines:
        # A function replacement inserts the mapped value verbatim and looks
        # it up by the exact text that matched.
        new_line, count = combined_regexp.subn(
            lambda found: repl_map[found.group(0)], line)
        if count:
            log.info("from line %s to line %s", line, new_line)
        res.append(new_line)
    return res
def test_replace_string():
    """Check that dotted import paths are rewritten according to the mapping."""
    lines = ["from psi.io.api import x",
             "from psi.z import f"]
    expected = ["from psi.io.model.api import x",
                "from psi.model.z import f"]
    mapping = {'psi.io.api': 'psi.io.model.api',
               'psi.z': 'psi.model.z'}
    assert replace_map_to_text(mapping, lines) == expected
In short I compose a big regexp in the form
(first|second|third)
Then I search for it in every line and substitute with re.sub if something was found.
Still a bit rough but the simple test after works fine.
EDIT: fixed a nasty bug in the concatenation, because if it's not a raw string '.' means only one character, not a '.'

batch renaming 100K files with python

I have a folder with over 100,000 files, all numbered with the same stub, but without leading zeros, and the numbers aren't always contiguous (usually they are, but there are gaps) e.g:
file-21.png,
file-22.png,
file-640.png,
file-641.png,
file-642.png,
file-645.png,
file-2130.png,
file-2131.png,
file-3012.png,
etc.
I would like to batch process this to create padded, contiguous files. e.g:
file-000000.png,
file-000001.png,
file-000002.png,
file-000003.png,
When I parse the folder with for filename in os.listdir('.'): the files don't come up in the order I'd like to them to. Understandably they come up
file-1,
file-1x,
file-1xx,
file-1xxx,
etc. then
file-2,
file-2x,
file-2xx,
etc. How can I get it to go through in the order of the numeric value? I am a complete python noob, but looking at the docs i'm guessing I could use map to create a new list filtering out only the numerical part, and then sort that list, then iterate that? With over 100K files this could be heavy. Any tips welcome!
import re
thenum = re.compile(r'^file-(\d+)\.png$')


def bynumber(fn):
    """Return the number in a ``file-<n>.png`` name as an int, for sorting.

    Names that do not follow that form get ``-1`` so they sort before all
    numbered files.
    """
    mo = thenum.match(fn)
    # Python 3 cannot order None against int, so non-matching names get an
    # integer sentinel instead of the implicit None.
    return int(mo.group(1)) if mo else -1
allnames = os.listdir('.')
allnames.sort(key=bynumber)
Now you have the files in the order you want them and can loop
for i, fn in enumerate(allnames):
...
using the progressive number i (which will be 0, 1, 2, ...) padded as you wish in the destination-name.
There are three steps. The first is getting all the filenames. The second is converting the filenames. The third is renaming them.
If all the files are in the same folder, then glob should work.
import glob
filenames = glob.glob("/path/to/folder/*.txt")
Next, you want to change the name of the file. You can print with padding to do this.
>>> filename = "file-338.txt"
>>> import os
>>> fnpart = os.path.splitext(filename)[0]
>>> fnpart
'file-338'
>>> _, num = fnpart.split("-")
>>> num.rjust(5, "0")
'00338'
>>> newname = "file-%s.txt" % num.rjust(5, "0")
>>> newname
'file-00338.txt'
Now, you need to rename them all. os.rename does just that.
os.rename(filename, newname)
To put it together:
for filename in glob.glob("/path/to/folder/*.txt"): # loop through each file
    newname = make_new_filename(filename) # create a function that does step 2, above
    os.rename(filename, newname)
Thank you all for your suggestions, I will try them all to learn the different approaches. The solution I went for is based on using a natural sort on my filelist, and then iterating that to rename. This was one of the suggested answers but for some reason it has disappeared now so I cannot mark it as accepted!
import os
files = os.listdir('.')
natsort(files)
# enumerate supplies the running index, so no manual counter is needed.
for index, filename in enumerate(files):
    os.rename(filename, str(index).zfill(7)+'.png')
where natsort is defined in http://code.activestate.com/recipes/285264-natural-string-sorting/
Why don't you do it in a two step process. Parse all the files and rename with padded numbers and then run another script that takes those files, which are sorted correctly now, and renames them so they're contiguous?
1) Take the number in the filename.
2) Left-pad it with zeros
3) Save name.
def renamer():
    """Zero-pad the trailing number of every ``<stem>-<number>.<ext>`` file.

    Works on the current directory. Spaces are removed from the name, the
    number is left-padded with zeros to six digits, and the file is renamed.
    Entries that do not have that shape (no dash, non-numeric suffix, no
    extension) are skipped instead of aborting the whole run.
    """
    for iname in os.listdir('.'):
        stem, ext = os.path.splitext(iname.replace(" ", ""))
        first, dash, number = stem.rpartition("-")
        if not dash or not number.isdigit() or not ext:
            continue
        # pad the number to be 6 digits long
        oname = first + "-" + number.zfill(6) + ext
        if oname != iname:
            os.rename(iname, oname)
    print("Done")
Hope this helps
The simplest method is given below. You can also modify for recursive search this script.
use os module.
get filenames
os.rename
import os
class Renamer:
    """Rename every ``*.<extension>`` file in the working directory to
    ``<pattern>_<n>.<extension>``, numbering from 0 in sorted name order.
    """

    def __init__(self, pattern, extension):
        self.ext = extension
        self.pat = pattern

    def rename(self):
        """Perform the renaming in two passes.

        Files are first moved to unique temporary names and only then to
        their final names. Renaming directly could overwrite a file that
        already carries one of the target names (for example when the
        script is run twice in the same folder).
        """
        import uuid

        suffix = f".{self.ext}"
        # Sort the listing so the numbering does not depend on the order in
        # which the operating system returns the entries.
        names = sorted(n for n in os.listdir(os.getcwd()) if n.endswith(suffix))

        token = uuid.uuid4().hex
        staged = []
        for number, name in enumerate(names):
            temp = f".{token}-{number}.tmp"
            os.rename(name, temp)
            staged.append(temp)

        for number, temp in enumerate(staged):
            os.rename(temp, f'{self.pat}_{number}.{self.ext}')
if __name__ == "__main__":
    # Rename every .txt file in the working directory to myfile_<n>.txt.
    pattern = "myfile"
    extension = "txt"
    r = Renamer(pattern=pattern, extension=extension)
    r.rename()

Categories