semistructured data land: sorting paragraphs by aisle number
kragen at pobox.com
kragen at pobox.com
Sat Nov 5 03:37:01 EST 2005
#!/usr/bin/python
"""Sort paragraphs in a text file according to a key defined by some
regular expression. I built this so I could sort the books I wanted
to get according to which aisle they were shelved in.
"""
import re, sys
def paragraphs(afile):
    """Iterate over the paragraphs in a text file.

    afile is any iterable of lines (an open text file, a list of
    strings, ...).  A paragraph is a run of consecutive lines that
    contain something other than whitespace; it is yielded as a single
    string with the original line endings left intact.  Empty and
    whitespace-only lines separate paragraphs and are never yielded.
    """
    # Collect the lines of the paragraph in progress and join them once,
    # rather than growing a string with += for every line.
    current = []
    for line in afile:
        if line.strip():
            current.append(line)
        elif current:
            # A blank line closes the paragraph in progress, if any.
            yield ''.join(current)
            current = []
    # The input may end without a trailing blank line.
    if current:
        yield ''.join(current)
def get_aisle(regexp):
    """Build a function that extracts a sort key from a paragraph.

    regexp is a regular expression (pattern string or compiled
    pattern).  The returned function searches a paragraph for it and
    returns the text matched by the first capture group, or None when
    the pattern does not match.  If the pattern contains no capture
    group at all, the text of the whole match is returned instead.

    The pattern is compiled here, so an invalid pattern raises re.error
    when get_aisle() is called rather than on the first search.
    """
    pattern = re.compile(regexp)
    # group(1) raises IndexError for a pattern without capture groups;
    # fall back to group 0 (the whole match) in that case.
    group = 1 if pattern.groups else 0

    def _(para):
        mo = pattern.search(para)
        if mo:
            return mo.group(group)
        return None

    return _
def doit(infile, regexp):
    """Print the paragraphs of infile sorted by the text regexp extracts.

    infile is an iterable of lines, split into paragraphs by
    paragraphs().  regexp is passed to get_aisle() to build the
    per-paragraph sort key.  Paragraphs the pattern does not match are
    printed first; list.sort() is stable, so paragraphs with equal keys
    keep their input order.  The result is written to standard output
    with one blank line between paragraphs.
    """
    extract = get_aisle(regexp)

    def sort_key(para):
        # The extracted key is None for a paragraph the pattern does not
        # match.  Python 3 cannot order None against str, so wrap the key
        # in a tuple: (False, '') sorts ahead of every (True, text),
        # which keeps unmatched paragraphs first as Python 2 did.
        found = extract(para)
        if found is None:
            return (False, '')
        return (True, found)

    paras = []
    for para in paragraphs(infile):
        # The last paragraph of a file with no final newline lacks its
        # line terminator; add it so that joining with '\n' cannot glue
        # that paragraph onto whichever one follows it after sorting.
        if not para.endswith('\n'):
            para += '\n'
        paras.append(para)

    paras.sort(key=sort_key)
    # Parenthesized single-argument print works on Python 2 and 3 alike.
    print('\n'.join(paras))
if __name__ == '__main__':
    # Usage: aislesort.py FILE [REGEXP]
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE [REGEXP]' % sys.argv[0])
    # An absent or empty REGEXP falls back to sorting by aisle number.
    if len(sys.argv) > 2 and sys.argv[2]:
        pattern = sys.argv[2]
    else:
        pattern = r'isle (\d+)'
    # open() replaces the Python-2-only file() builtin, and the with
    # block closes the input once the sorted output has been printed.
    with open(sys.argv[1]) as infile:
        doit(infile, pattern)
# ./aislesort.py ~/sdc1/kragen-pim/books 'by \w+ (\w+)'
# ./aislesort.py ~/sdc1/kragen-pim/books '_([\w ]+)_'
# ./aislesort.py ~/sdc1/kragen-pim/books '_(?:The )?([\w ]+)_'
# ./aislesort.py ~/sdc1/kragen-pim/books 'Recommended by (\w+)'
# ./aislesort.py ~/sdc1/kragen-pim/books '(\d\d\d\d-\d\d-\d\d)'
# ./aislesort.py ~/sdc1/kragen-pim/books '(\d+) pages'
# ./aislesort.py ~/sdc1/kragen-pim/books "(?i)(harpercollins|o'reilly|princeton|mcgraw-hill|sheffield hallam university press)"
# ./aislesort.py ~/sdc1/kragen-pim/books "ISBN\s+([-\d]+)"
More information about the Kragen-hacks
mailing list