six ways to do repr() with limited resource usage

Kragen Javier Sitaker kragen at pobox.com
Mon Feb 5 15:17:12 EST 2007


#!/usr/bin/python
# Various approaches to a "safe" repr that never generates too much text.
# Demonstrates various approaches to laziness in a Python setting.
# In order to be more comprehensible, doesn't cover every Python type,
# leaving out new-style classes in particular; and it doesn't even try
# to be object-oriented.

# I want to emphasize that this code is not part of any production
# code, and several of the implementations use a number of techniques
# that make it unnecessarily hard to understand.  There may be places
# where these techniques are called for, but they should be used very
# sparingly!

# (If you're programming in Haskell, you're probably using these
# techniques without even knowing it.  They cause less trouble in
# Haskell due to being implicit in the language.)

# I am writing this warning because in my early days as a programmer,
# I was enthusiastic about learning advanced new algorithms and ways
# of structuring programs, and I would often use them in contexts
# where they weren't really helpful.  In this case, the entire point
# of this module is to demonstrate different approaches to structuring
# a problem and understand their strengths and weaknesses; most
# programs have some more practical purpose that includes
# maintainability, which conflicts with complexity and unfamiliar
# program structures.

# If you need something like this module in Python in real life, you
# might try the TextRepr class inside pydoc.  It works reasonably well
# for most things, even though it's not too hard to get it to produce
# uselessly voluminous output:
# >>> yy = []
# >>> for ii in range(4): yy = [yy] * 20
# ... 
# >>> xx = pydoc.TextRepr().repr(yy)
# Or take several hours to run; just change the 4 to a larger number,
# such as 7.

import types

class UnhandledType(Exception): pass

### Dumb simple version: generate whole thing, then truncate it.  22 lines.
# Doesn't "work" in the sense of "never generating too much text", and
# it's trivial to make it painfully slow; for example:
# >>> x = 'x' * 10485760
# >>> for ii in range(100): x = [x]
# ... 
# >>> safe_repr.safe_repr_1(x, 200)
# "[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[['xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# It takes a few seconds to generate that result, as does repr(x)[:200].
# All the other implementations (except for safe_repr_2, which doesn't handle
# strings) generate it instantly.

def _safe_repr_1(val):
    if isinstance(val, types.ListType):
	return '[%s]' % ', '.join(map(_safe_repr_1, val))
    elif isinstance(val, types.TupleType):
	return '(%s)' % ', '.join(map(_safe_repr_1, val))
    elif isinstance(val, types.IntType):
	return '%d' % val
    elif isinstance(val, types.DictType):
	return '{%s}' % ', '.join(['%s: %s' % (_safe_repr_1(k), _safe_repr_1(v))
				   for k, v in val.items()])
    elif isinstance(val, types.InstanceType):
	# note that this is very different behavior from the standard
	# Python repr!
	return '%s %s' % (val.__class__.__name__, _safe_repr_1(val.__dict__))
    elif isinstance(val, types.StringType):
	return repr(val)  # I'm not going to bother writing out how that works
    else: 
	raise UnhandledType(val)

def safe_repr_1(val, length):
    "A dumb version that generates it all, then truncates.  For comparison."
    return _safe_repr_1(val)[:length]

### Badly programmed version: never append more than you
# need to.  24 lines, incomplete.

def safe_repr_2(val, length):
    """A partial version that never appends more than it needs to.  
    Too painful to complete."""
    rv = []
    if isinstance(val, types.ListType):
	if length > 0:
	    rv.append('[')
	    length -= 1
	for ii in range(len(val)):
	    item = val[ii]
	    if length <= 0: break
	    rv.append(safe_repr_2(item, length))
	    length -= len(rv[-1])
	    if ii < len(val) - 1:
		rv.append(', '[:length])
		length -= len(rv[-1])
	if length > 0:
	    rv.append(']')
	    length -= 1
    elif isinstance(val, types.IntType):
	rv.append(('%d' % val)[:length])
    else:
	raise UnhandledType(val)
    return ''.join(rv)

### Refactored version: never append more than you need to, and use an
# exception to quit when you've generated enough.  54 lines.

def _safe_repr_3_items(xform, items, buf):
    for ii in range(len(items)):
	xform(items[ii], buf)
	if ii < len(items) - 1: buf.append(', ')

def _safe_repr_3_dict_item((k, v), buf):
    _safe_repr_3(k, buf)
    buf.append(': ')
    _safe_repr_3(v, buf)

def _safe_repr_3(val, buf):
    if isinstance(val, types.ListType):
	buf.append('[')
	_safe_repr_3_items(_safe_repr_3, val, buf)
	buf.append(']')
    elif isinstance(val, types.TupleType):
	buf.append('(')
	_safe_repr_3_items(_safe_repr_3, val, buf)
	buf.append(')')
    elif isinstance(val, types.IntType):
	buf.append('%d' % val)
    elif isinstance(val, types.DictType):
	buf.append('{')
	_safe_repr_3_items(_safe_repr_3_dict_item, val.items(), buf)
	buf.append('}')
    elif isinstance(val, types.InstanceType):
	buf.append(val.__class__.__name__ + ' ')
	_safe_repr_3(val.__dict__, buf)
    elif isinstance(val, types.StringType):
	buf.append(repr(val))
    else:
	raise UnhandledType(val)

class BufferFull(Exception): pass
class Buf:
    def __init__(self, maxlen):
	self.maxlen = maxlen
	self.buf = []
	self.curlen = 0
    def append(self, astr):
	astr = astr[:self.maxlen - self.curlen]
	self.buf.append(astr)
	self.curlen += len(astr)
	if self.curlen == self.maxlen:
	    raise BufferFull()
    def contents(self):
	return ''.join(self.buf)

def safe_repr_3(val, length):
    "A version that uses an exception to break out of recursion."
    buf = Buf(length)
    try: _safe_repr_3(val, buf)
    except BufferFull: pass
    return buf.contents()

### Recursive Generator Version, in which we simply stop iterating
# when we have enough.  49 lines.

# This one is able to handle deeply nested data structures, as long as
# you don't ask it to traverse the whole thing; on my Python, that
# means you're safe as long as you don't want more than 499
# characters.

def _safe_repr_4_items(xform, items):
    for ii in range(len(items)):
	for item in xform(items[ii]): yield item
	if ii < len(items) - 1: yield (', ')

def _safe_repr_4_dict_item((k, v)):
    for item in _safe_repr_4(k): yield item
    yield ': '
    for item in _safe_repr_4(v): yield item

def _safe_repr_4(val):
    if isinstance(val, types.ListType):
	yield '['
	for item in _safe_repr_4_items(_safe_repr_4, val): yield item
	yield ']'
    elif isinstance(val, types.TupleType):
	yield '('
	for item in _safe_repr_4_items(_safe_repr_4, val): yield item
	yield ')'
    elif isinstance(val, types.IntType):
	yield '%d' % val
    elif isinstance(val, types.DictType):
	yield '{'
	for item in _safe_repr_4_items(_safe_repr_4_dict_item, val.items()):
	    yield item
	yield '}'
    elif isinstance(val, types.InstanceType):
	yield val.__class__.__name__ + ' '
	for item in _safe_repr_4(val.__dict__): yield item
    elif isinstance(val, types.StringType):
	# This time I will bother to write it out.
	yield "'"
	for each_char in val:
	    if each_char == '\\': yield '\\\\'
	    elif each_char == "'": yield "\\'"
	    else: yield each_char
	yield "'"
    else:
	raise UnhandledType(val)

def safe_repr_4(val, length):
    "A version using recursive generators."
    rv = []
    for chunk in _safe_repr_4(val):
	chunk = chunk[:length]
	length -= len(chunk)
	rv.append(chunk)
	if length == 0: break
    return ''.join(rv)

### Stream version without generators.  64 lines, plus 11 lines of streams.
# Here we see how we could do the same thing more painfully in a
# language with closures but without generators.  Surprisingly, this
# is only about 20% more code, if you discount the first four
# functions whose sole reason for being is to manipulate streams.
# However, it did have several bugs in it at first, so I think it's
# clearly harder to understand.

# At present, it dies with a stack overflow on deeply nested data
# structures, even when it shouldn't have to be traversing all the
# levels in order to produce the requested number of characters.  In
# that sense, it's "more eager" and "less lazy" than the generator
# version.  It's possible to make it lazier to solve that problem, but
# I haven't figured out how yet.

def stream_append(a, b):
    afirst, arest = a
    if afirst is None: return b
    return (afirst, lambda: stream_append(arest(), b))
empty_stream = (None, None)
def one_item_stream(x): return (x, lambda: empty_stream)
def stream_append_n(streams):
    streams.reverse()
    rv = empty_stream
    for stream in streams: rv = stream_append(stream, rv)
    return rv

def _safe_repr_5_map(xform, val, ii):
    if ii == len(val): return empty_stream
    else:
	return (xform(val[ii]), lambda: _safe_repr_5_map(xform, val, ii+1))

def _safe_repr_5_items(xform, val, ii):
    if ii == len(val): return empty_stream
    elif ii == len(val) - 1: return xform(val[ii])
    else:
	# we explicitly cons up the second stream here to maintain laziness
	# in case there are a lot of items
	return stream_append(xform(val[ii]),
            (", ", lambda: _safe_repr_5_items(xform, val, ii+1)))

def _safe_repr_5_dict_item((k, v)):
    return stream_append_n([_safe_repr_5(k), one_item_stream(": "),
			    _safe_repr_5(v)])

def _safe_repr_5(val):
    "A stream is a (first_chunk, stream) pair, or (None, dontcare) for empty."
    if isinstance(val, types.ListType):
	return stream_append_n([one_item_stream('['),
				_safe_repr_5_items(_safe_repr_5, val, 0),
				one_item_stream(']')])
    elif isinstance(val, types.TupleType):
	return stream_append_n([one_item_stream('('),
				_safe_repr_5_items(_safe_repr_5, val, 0),
				one_item_stream(')')])
    elif isinstance(val, types.IntType):
	return one_item_stream('%d' % val)
    elif isinstance(val, types.DictType):
	return stream_append_n([
		one_item_stream('{'),
		# note that we may lose some laziness here building a big list
		_safe_repr_5_items(_safe_repr_5_dict_item, val.items(), 0),
		one_item_stream('}')])
    elif isinstance(val, types.InstanceType):
	return stream_append(one_item_stream(val.__class__.__name__ + ' '),
			     _safe_repr_5(val.__dict__))
    elif isinstance(val, types.StringType):
	def chartrans(each_char):
	    if each_char == '\\': return '\\\\'
	    elif each_char == "'": return "\\'"
	    else: return each_char
	return stream_append_n([one_item_stream("'"),
            _safe_repr_5_map(chartrans, val, 0),
            one_item_stream("'")])				
	return one_item_stream(repr(val))
    else:
	raise UnhandledType(val)

def safe_repr_5(val, length):
    "A version using lazy streams."
    rv = []
    stream = _safe_repr_5(val)
    while 1:
	chunk, tail = stream
	if chunk is None: break
	chunk = chunk[:length]
	length -= len(chunk)
	rv.append(chunk)
	if length == 0: break
	stream = tail()
    return ''.join(rv)

### Explicit stack version, eschewing recursion and even functions.  61 lines.
# Unsurprisingly, this is nearly the longest version of all, although
# it had surprisingly few bugs.
# In a sense, it's "safer" than the other implementations because it 
# can handle data structures that are nested thousands of levels deep
# without trouble.

def safe_repr_6(val, length):
    "Explicit stack version, eschewing recursion and functions entirely."
    stack = [('val', val)]
    rv = []
    while stack:
	tag, val = stack.pop()
	to_add = None
	if tag == 'val':
	    if isinstance(val, types.ListType):
		to_add = '['
		stack.append(('literal', ']'))
		stack.append(('list_contents', (val, 0)))
	    elif isinstance(val, types.TupleType):
		to_add = '('
		stack.append(('literal', ')'))
		stack.append(('list_contents', (val, 0)))
	    elif isinstance(val, types.IntType):
		to_add = '%d' % val
	    elif isinstance(val, types.DictType):
		to_add = '{'
		stack.append(('literal', '}'))
		stack.append(('dict_contents', (val.items(), 0)))
	    elif isinstance(val, types.InstanceType):
		to_add = val.__class__.__name__ + ' '
		stack.append(('val', val.__dict__))
	    elif isinstance(val, types.StringType):
		to_add = "'"
		stack.append(('literal', "'"))
		stack.append(('string', (val, 0)))
	    else: 
		raise UnhandledType(val)
	elif tag == 'literal':
	    to_add = val
	elif tag == 'list_contents' or tag == 'dict_contents':
	    val, ii = val
	    if len(val) > 1 + ii:
		stack.append((tag, (val, ii + 1)))
		stack.append(('literal', ', '))
	    if len(val) > ii:
		item = val[ii]
		if tag == 'list_contents':
		    stack.append(('val', item))
		else:  # tag == 'dict_contents'
		    k, v = item
		    stack.append(('val', v))
		    stack.append(('literal', ': '))
		    stack.append(('val', k))
	elif tag == 'string':
	    val, ii = val
	    if len(val) > ii:
		stack.append((tag, (val, ii + 1)))
		each_char = val[ii]
		if each_char == '\\': to_add = '\\\\'
		elif each_char == "'": to_add = "\\'"
	        else: to_add = each_char
	if to_add is not None:
	    to_add = to_add[:length]
	    length -= len(to_add)
	    rv.append(to_add)
	    if length == 0: break
    return ''.join(rv)

### Unit tests, to verify that these functions all work.

def test_fun_on(fun, val):
    for length in range(40):
	rstr = fun(val, length)
	assert len(rstr) <= length, (rstr, val, length, len(rstr))
	assert rstr == safe_repr_1(val, length), (rstr, 
						  safe_repr_1(val, length))

def test_fun(fun):
    test_fun_on(fun, range(11))
    y = []
    for ii in range(10): y = [y]
    test_fun_on(fun, y)

def test_fun_harder(fun):
    class Foo: pass
    x = Foo()
    x.blarg = 37
    test_fun_on(fun, (x, {}, [(131, 'foo')]))

def test_all():
    test_fun(safe_repr_1)
    test_fun_harder(safe_repr_1)
    test_fun(safe_repr_2)

    test_fun(safe_repr_3)
    test_fun_harder(safe_repr_3)
    test_fun(safe_repr_4)
    test_fun_harder(safe_repr_4)
    test_fun(safe_repr_5)
    test_fun_harder(safe_repr_5)
    test_fun(safe_repr_6)
    test_fun_harder(safe_repr_6)

test_all()


More information about the Kragen-hacks mailing list