computing lexical signatures of paragraphs (for "queer numbers")
kragen at pobox.com
kragen at pobox.com
Sat Oct 22 03:37:02 EDT 2005
#!/usr/bin/python
# Compute "lexical signatures" for each paragraph in a text document
# The idea is to compress these lossily into "queer numbers" to be
# used as fragment IDs. See my "queer numbers" post on kragen-tol.
# This Python program depends on generator expressions, which were
# introduced in Python 2.4. You can wrap the generator expressions in
# [] to turn them into list comprehensions to make them run in older
# Pythons.
import re, sys
def paragraphs(afile):
"Iterate over the paragraphs in a text file."
current_paragraph = ''
for line in afile:
if line.strip() == '':
if current_paragraph != '': yield current_paragraph
current_paragraph = ''
else:
current_paragraph += line
if current_paragraph != '': yield current_paragraph
def words(astring):
"Iterate over the words in a string, such as a paragraph."
for word in re.findall("['\w]+", astring): yield word.lower()
def freq(aseq):
"Compute a frequency table of each unique element in a sequence."
rv = {}
for item in aseq:
rv.setdefault(item, 0)
rv[item] += 1
return rv
def mergefreqs(freqs):
"Given a sequence of frequency tables, compute a summary table."
rv = {}
for freq in freqs:
for key, value in freq.iteritems():
rv.setdefault(key, 0)
rv[key] += value
return rv
def nonuniques(freqs):
"Cull a frequency table to remove things that occur only once."
return dict((k, v) for k, v in freqs.iteritems() if v > 1)
def count(freq):
"Given a frequency table, return the length of the sequence it represents."
return float(sum(freq.itervalues()))
def tfidf(freqs):
"""Compute TF/IDF over a sequence of frequency tables.
I'm not sure this is actually TF/IDF, because I'm not sure I know
what TF/IDF is.
"""
freqs = list(freqs)
totals = mergefreqs(freqs)
total_terms = count(totals)
for freq in freqs:
rv = {}
terms = count(freq)
for word, n in freq.iteritems():
tf = n / terms
idf = totals[word] / total_terms
rv[word] = tf/idf
yield rv
def sigs(freqs):
"""Compute 'lexical signatures' in the form of ordered word lists,
given a sequence of frequency tables to start with.
"""
for thingy in tfidf(freqs):
words = thingy.keys()
words.sort(key = thingy.get, reverse = True)
yield words
def display_sigs(infile):
"""Print a text summary of lexical signatures of some file."""
freqs = list(freq(words(para)) for para in paragraphs(infile))
hmm = nonuniques(mergefreqs(freqs))
ii = sigs(freqs)
max = 10
for line in ii:
print ' '.join(line[:max])
print '->', ' '.join([word for word in line
if hmm.has_key(word)][:max])
if __name__ == '__main__': display_sigs(file(sys.argv[1]))
More information about the Kragen-hacks
mailing list