Browsing the OED by user-entered guidewords
Kragen Sitaker
kragen at pobox.com
Tue Nov 15 16:25:20 EST 2005
This is one of the stages of the project described at
http://lists.canonical.org/pipermail/kragen-tol/2005-October/000794.html.
This is currently running at http://considerate.murch-sitaker.org:8000/
and mostly seems to work OK technically --- the UI is a different
problem altogether. New users universally enter the word they're
looking for in the guidewords boxes. I haven't yet experimented with
ways to solve this user interface problem, but it clearly needs solving.
I'm also planning to use a Google-Maps-like interface to avoid having to
download the entire 750K JPEG to see any part of the full-res page.
I need to figure out how to put the gigabyte of page image data up on
the web.
#!/usr/bin/python
import twisted.web.resource, twisted.internet.reactor, twisted.web.static, sys
import os.path, nevow, twisted.web.error, urllib, random, string, pickle
import urlparse
# Short alias for nevow.tags; the pages below are built as T.html[...],
# T.form(...), etc. and turned into strings with flatten().
T = nevow.tags
# To do:
# D put up on web
# D make reduced images <1024 pixels high
# kragen at considerate:/mnt/raid/kragen/media/oed-v$ time
# for x in newenglishdict05murrmiss_jpg/*.jpg; do
# convert -size 1024x1024 -resize 1024x1024 "$x"
# thumbnails/"$(basename "$x")";
# echo $x done;
# done
# D make navigation among these reduced images possible by wrapping them in
# pages
# D remember the words at the top of each image
# D support navigation by words
# - make much smaller thumbnails somehow --- copying 100 thumbnails to
# panacea only managed 40KB/s in 3:46, i.e. 226 seconds, or 2.26
# seconds each. Perhaps only the top 200 pixels or so, or 16KB each?
# D keep a file that knows the words at the top of each image
# - make a torrent for the images
# - make a streamlined data entry UI for guidewords?
# - remove "if it's in the book" when it's a headword
# - never redirect back to the same page
def flatten(stan):
    """Render a nevow tag tree (as built with `T`) to a plain string."""
    rendered = nevow.flat.flatten(stan)
    return str(rendered)
def dict_precedes(a, b):
    """Return true if word `a` sorts strictly before word `b`.

    Both words are compared after lowercasing and dropping every character
    that is not an ASCII letter, so case, hyphens and spaces are ignored
    ('Hades' precedes 'Had-I-Wist').
    """
    def sort_key(word):
        # Keep only the ASCII letters of the lowercased word.
        kept = []
        for ch in word.lower():
            if ch in string.ascii_letters:
                kept.append(ch)
        return ''.join(kept)

    key_a = sort_key(a)
    key_b = sort_key(b)
    return key_a < key_b
class Book:
    """A scanned dictionary volume: page images, thumbnails and guidewords.

    A page is identified by its index into `thumbnail_list`, the sorted list
    of file names found in the thumbnails directory; the full-size image of
    a page is looked up under the same file name.  The guidewords (the first
    and last word a user read off the top of a page) are held in
    `guidewords_list` (None for pages nobody has entered yet) and appended
    to `guidewordlog` as pickled (index, first, last) tuples.

    NOTE: the log is read back with pickle.load(), so it must only ever be
    a file written by this program, never data from an untrusted source.
    """

    # Initial bracket used by look_for().  XXX hard-coded for OED vol V;
    # presumably the pages just outside the dictionary text proper.
    SEARCH_START = 21
    SEARCH_END = 1298

    def __init__(self, images, thumbnails, guidewordlog):
        """Set up the book.

        images       -- directory holding the full-size page images
        thumbnails   -- directory holding the reduced page images
        guidewordlog -- file object open for reading and writing; existing
                        records are loaded and new ones appended
        """
        self.images = images
        self.thumbnails = thumbnails
        # os.listdir() returns names in arbitrary order.  Page numbers are
        # indexes into this list, so sort it to make them follow file-name
        # order and stay the same from one run to the next.
        names = os.listdir(self.thumbnails)
        names.sort()
        self.thumbnail_list = names
        self.guidewords_list = [None] * len(self.thumbnail_list)
        self.guidewordlog = guidewordlog
        self.read_guidewords()

    def _valid_index(self, index):
        """Return true if `index` names an existing page."""
        return 0 <= index < len(self.guidewords_list)

    def read_guidewords(self):
        """Replay the guideword log into `guidewords_list`.

        Later records for a page override earlier ones.  Records whose page
        number is outside the current thumbnail list are skipped rather
        than aborting the load.
        """
        self.guidewordlog.seek(0)
        try:
            while 1:
                pageno, first, last = pickle.load(self.guidewordlog)
                if self._valid_index(pageno):
                    self.guidewords_list[pageno] = first, last
        except EOFError:
            pass
        # Explicitly position at the end so that add_guidewords() appends;
        # switching from reading to writing without a seek is not reliable.
        self.guidewordlog.seek(0, 2)

    def images_resource(self):
        """Return a web resource serving the full-size image directory."""
        return twisted.web.static.File(self.images)

    def thumbnails_resource(self):
        """Return a web resource serving the thumbnail directory."""
        return twisted.web.static.File(self.thumbnails)

    def viewer_resource(self):
        """Return the web resource for browsing this book page by page."""
        return ThumbnailNavigator(self)

    def thumbnail_url(self, index):
        """Return the URL path of the thumbnail for page `index`."""
        return "/thumbnails/" + self.thumbnail_list[index]

    def fullsize_url(self, index):
        """Return the URL path of the full-size image for page `index`."""
        return "/images/" + self.thumbnail_list[index]

    def add_guidewords(self, index, first, last):
        """Record `first` and `last` as the guidewords of page `index`.

        The record is stored in memory and appended to the log, which is
        flushed immediately.  Raises IndexError if there is no such page
        (negative indexes are rejected instead of wrapping around).
        """
        if not self._valid_index(index):
            raise IndexError("no such page: %r" % (index,))
        self.guidewords_list[index] = first, last
        print("Page %s has guidewords %s and %s" % (index, first, last))
        pickle.dump((index, first, last), self.guidewordlog)
        self.guidewordlog.flush()

    def get_guidewords(self, index):
        """Return the (first, last) guidewords of page `index`.

        Returns None when none have been entered or there is no such page.
        """
        if not self._valid_index(index):
            return None
        return self.guidewords_list[index]

    def should_be_on_page(self, index, word):
        """Return true if `word` falls within the guidewords of page `index`.

        Always false for a page whose guidewords are unknown.
        """
        guidewords = self.get_guidewords(index)
        if not guidewords:
            return False
        first, last = guidewords
        return (not dict_precedes(word, first) and
                not dict_precedes(last, word))

    def look_for(self, word):
        """Return the page number to look at next when searching for `word`.

        If a page with known guidewords covers the word, that page is
        returned.  Otherwise the known pages are used to bracket the word
        (latest page ending before it, earliest page starting after it) and
        the midpoint of the bracket is returned, i.e. one step of a binary
        search driven by whatever guidewords users have entered so far.
        """
        last_before, first_after = self.SEARCH_START, self.SEARCH_END
        word = word.lower()
        for ii, guidewords in enumerate(self.guidewords_list):
            if not guidewords:
                continue
            if self.should_be_on_page(ii, word):
                return ii
            first, last = guidewords
            if dict_precedes(last, word) and ii < first_after:
                last_before = ii
            if dict_precedes(word, first) and ii < first_after:
                first_after = ii
        if last_before < first_after - 1:
            return (last_before + first_after) // 2
        else:
            # The bracket is empty, so the recorded guidewords are
            # inconsistent: one of these must be wrong?
            return random.choice([last_before, first_after])
class ThumbnailPage(twisted.web.resource.Resource):
    """Web page showing the thumbnail of one page of the book.

    GET shows the thumbnail (linked to the full-size image) with Prev/Next
    links; when a search word is given as `q` it also shows a form asking
    for the guidewords printed at the top of the page.  POST records the
    submitted guidewords and redirects to the page the book recommends
    looking at next for `q`.
    """

    def __init__(self, book, index):
        """`book` supplies URLs and guidewords; `index` is the page number."""
        twisted.web.resource.Resource.__init__(self)
        self.book = book
        self.index = index

    def render_GET(self, req):
        """Return the HTML for this page as a string."""
        print("Displaying page %s" % (self.index,))
        form = ''
        if 'q' in req.args:
            word = req.args['q'][0]
            # An empty query has no first letter to test; treat it like
            # any other word that is outside this volume.
            if not word or word[0].lower() not in 'hijk':
                warning = "(Not in this dictionary, which only covers HIJK.)"
            elif self.book.should_be_on_page(self.index, word):
                warning = "(Should be on this page if it's in this book.)"
            else:
                warning = ''
            # Pre-fill the form with any guidewords already known.
            first, last = '', ''
            guidewords = self.book.get_guidewords(self.index)
            if guidewords:
                first, last = guidewords
            form = T.form(method="POST")[
                T.p["Searching for ", T.b[word], ". ", T.b[warning]],
                "Enter the guide words at the top of the page: ",
                T.input(type="text", name="first_word", value=first),
                T.input(type="text", name="last_word", value=last),
                T.input(type="hidden", name="q", value=word),
                T.input(type="submit", value="Update"),
            ]
        return flatten(
            T.html[T.head[T.title['Page number ', str(self.index)]],
                   T.body[
                       form,
                       T.script(type="text/javascript")[
                           "document.forms[0][0].focus()"
                       ],
                       T.a(href=self.page_link(self.index-1, req))["Prev"],
                       ' ',
                       T.a(href=self.book.fullsize_url(self.index))[
                           T.img(src=self.book.thumbnail_url(self.index),
                                 align="top")
                       ],
                       ' ',
                       T.a(href=self.page_link(self.index+1, req))["Next"],
                   ],
            ])

    def render_POST(self, req):
        """Record submitted guidewords, then redirect towards the word.

        If only one of the two guideword fields is filled in, it is used
        for both.  Blank submissions are not recorded: empty guidewords
        would sort before every word and mislead later searches.
        """
        if 'first_word' in req.args:
            first = req.args['first_word'][0].strip()
            last = req.args.get('last_word', [''])[0].strip()
            first = first or last
            last = last or first
            if first:
                self.book.add_guidewords(self.index, first, last)
        base_url = req.prePathURL()
        word = req.args.get('q', [''])[0]
        if word:
            recommended_page_number = self.book.look_for(word)
            newurl = (str(recommended_page_number) + '?q=' +
                      urllib.quote(word))
        else:
            # Nothing to search for: just show this page again.
            newurl = str(self.index)
        newurl = urlparse.urljoin(base_url, newurl)
        req.redirect(newurl)
        return ''               # who cares about Opera 1.0?

    def page_link(self, index, req):
        """Return a relative link to page `index`, keeping any search word."""
        if 'q' in req.args:
            return '%d?q=%s' % (index, urllib.quote(req.args['q'][0]))
        else:
            return str(index)
class ThumbnailNavigator(twisted.web.resource.Resource):
    """Entry point of the viewer: a search form plus one child per page.

    The child named N (a decimal page number) is the ThumbnailPage for
    page N of the book.
    """

    def __init__(self, book):
        """`book` is the Book whose pages are served."""
        twisted.web.resource.Resource.__init__(self)
        self.book = book

    def getChild(self, childname, req):
        """Return the page resource for `childname`, or a 404 resource.

        A 404 is returned both for names that are not integers and for
        page numbers outside the book; a negative number would otherwise
        wrap around to the last pages and a too-large one would fail while
        rendering.
        """
        try:
            index = int(childname)
        except ValueError:
            return twisted.web.error.NoResource("Misspelled page number.")
        if not 0 <= index < len(self.book.thumbnail_list):
            return twisted.web.error.NoResource("No such page.")
        return ThumbnailPage(self.book, index)

    def render_GET(self, req):
        """Return the HTML of the viewer's start page with its search box."""
        return flatten(
            T.html[T.head[T.title["OED Volume V"]],
                   T.body["See ", T.a(href="page/0")["the first page"],
                          ' or search for a word from H to K. Searching may ',
                          'require that you type in the guidewords you see on ',
                          'up to six pages before you get to the correct page: ',
                          T.form(method='POST', action='page/1')[
                              T.input(name='q', value='hawk')
                          ],
                          T.script(type="text/javascript")[
                              "document.forms[0][0].focus()"
                          ],
                   ]])
# Static HTML for the site's root page: links to the viewer, the full-size
# image directory and the thumbnail directory.  Rendered once at import time.
# The mailto: address uses "@"; the " at " form is mailing-list-archive
# obfuscation and does not make a working link.
indexpage = flatten(
    T.html[T.head[T.title["Index to OED volume V site"]],
           T.body[T.h1["Index to OED volume V site"],
                  T.ul[
                      T.li[T.a(href="page")["My prototype viewer"]],
                      T.li[T.a(href="images/")["Archive's index page"]],
                      T.li[T.a(href="thumbnails/")["Raw thumbnails dir"]],
                  ],
                  T.address[T.a(href="mailto:kragen@pobox.com")[
                      "Kragen Sitaker"]],
           ],
    ]
)
def ok(a, b):
    """Assert that `a` equals `b`; on failure the error carries both values."""
    pair = (a, b)
    assert a == b, pair
def test():
    """Self-test: check dict_precedes on known-ordered word pairs."""
    # Each pair is (earlier word, later word) in dictionary order; the
    # cases cover mixed case and hyphenated headwords.
    ordered_pairs = [
        ('ideologically', 'idiom'),
        ('Ideologically', 'idiom'),
        ('ideologically', 'Idiom'),
        ('Had-I-Wist', 'Haematite'),
        ('Hades', 'Had-I-Wist'),
        ('Hackthorn', 'Haddock'),
        ('Haddock', 'Hades'),
        ('Hackthorn', 'Hades'),
    ]
    for earlier, later in ordered_pairs:
        assert dict_precedes(earlier, later)

# Run the self-test every time the module is loaded.
test()
def main(port):
    """Serve the OED volume V site on TCP port `port` until the reactor stops.

    Expects, in the current directory, the full-size images in
    'newenglishdict05murrmiss_jpg' and the reduced images in 'thumbnails'.
    Guidewords entered by users are logged to the file 'guidewords', which
    is created if it does not exist yet.
    """
    # Site lives in twisted.web.server, which the file's top-level imports
    # never name; import it explicitly instead of relying on another twisted
    # module having loaded it.  Kept first so that the local name `twisted`
    # is bound before it is used below.
    import twisted.web.server

    root = twisted.web.resource.Resource()
    imagebase = '.'
    images = os.path.join(imagebase, 'newenglishdict05murrmiss_jpg')
    # Explicit check rather than assert, which disappears under python -O.
    if not os.path.exists(images):
        raise SystemExit("image directory %r not found" % (images,))
    thumbnails = os.path.join(imagebase, 'thumbnails')
    guidewordlog = os.path.join(imagebase, 'guidewords')
    # 'r+' cannot create a file, so make an empty log on the first run.
    if not os.path.exists(guidewordlog):
        open(guidewordlog, 'wb').close()
    # Binary mode, since the log holds pickled records.
    book = Book(images, thumbnails, open(guidewordlog, 'r+b'))
    root.putChild('', twisted.web.static.Data(indexpage, 'text/html'))
    root.putChild('images', book.images_resource())
    root.putChild('thumbnails', book.thumbnails_resource())
    root.putChild('page', book.viewer_resource())
    twisted.internet.reactor.listenTCP(port,
                                       twisted.web.server.Site(root))
    print("listening %s" % (port,))
    twisted.internet.reactor.run()
if __name__ == '__main__':
    # The only command-line argument is the TCP port to listen on; say so
    # instead of failing with a bare IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s PORT" % (sys.argv[0],))
    main(int(sys.argv[1]))
More information about the Kragen-hacks
mailing list