Files
codeapi-new/pg_encoder.py
2025-10-21 21:26:56 +08:00

590 lines
24 KiB
Python

# Online Python Tutor
# https://github.com/pgbovine/OnlinePythonTutor/
#
# Copyright (C) Philip J. Guo (philip@pgbovine.net)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Thanks to John DeNero for making the encoder work on both Python 2 and 3
# (circa 2012-2013)
# Given an arbitrary piece of Python data, encode it in such a manner
# that it can be later encoded into JSON.
# http://json.org/
#
# We use this function to encode run-time traces of data structures
# to send to the front-end.
#
# Format:
# Primitives:
# * None, int, long, float, str, bool - unchanged
# (json.dumps encodes these fine verbatim, except for inf, -inf, and nan)
#
# exceptions: float('inf') -> ['SPECIAL_FLOAT', 'Infinity']
# float('-inf') -> ['SPECIAL_FLOAT', '-Infinity']
# float('nan') -> ['SPECIAL_FLOAT', 'NaN']
# x == int(x) -> ['SPECIAL_FLOAT', '%.1f' % x]
# (this way, 3.0 prints as '3.0' and not as 3, which looks like an int)
#
# If render_heap_primitives is True, then primitive values are rendered
# on the heap as ['HEAP_PRIMITIVE', <type name>, <value>]
#
# (for SPECIAL_FLOAT values, <value> is a list like ['SPECIAL_FLOAT', 'Infinity'])
#
# added on 2018-06-13:
# ['IMPORTED_FAUX_PRIMITIVE', <label>] - renders externally imported objects
# like they were primitives, to save
# space and to avoid having to
# recurse into them to see internals
#
# Compound objects:
# * list - ['LIST', elt1, elt2, elt3, ..., eltN]
# * tuple - ['TUPLE', elt1, elt2, elt3, ..., eltN]
# * set - ['SET', elt1, elt2, elt3, ..., eltN]
# * dict - ['DICT', [key1, value1], [key2, value2], ..., [keyN, valueN]]
# * instance - ['INSTANCE', class name, [attr1, value1], [attr2, value2], ..., [attrN, valueN]]
# * instance with non-trivial __str__ defined - ['INSTANCE_PPRINT', class name, <__str__ value>, [attr1, value1], [attr2, value2], ..., [attrN, valueN]]
# * class - ['CLASS', class name, [list of superclass names], [attr1, value1], [attr2, value2], ..., [attrN, valueN]]
# * function - ['FUNCTION', function name, parent frame ID (for nested functions),
# [*OPTIONAL* list of pairs of default argument names/values] ] <-- final optional element added on 2018-06-13
# * module - ['module', module name]
# * other - [<type name>, string representation of object]
# * compound object reference - ['REF', target object's unique_id]
#
# the unique_id is derived from id(), which allows us to capture aliasing
# number of digits after the decimal point that floats are rounded to
FLOAT_PRECISION = 4

import inspect
import math
import re
import sys
import types
from collections import defaultdict

# patterns that pull the bare type name out of str(<some type>), e.g.
# "<type 'int'>" (Python 2 built-ins) or "<class 'foo.Bar'>"
typeRE = re.compile(r"<type '(.*)'>")
classRE = re.compile(r"<class '(.*)'>")

# TODO: maybe use the 'six' library to smooth over Py2 and Py3 incompatibilities?
# (>= rather than == so that a later major version is not mistaken for Python 2)
is_python3 = sys.version_info[0] >= 3
if is_python3:
    # avoid name errors (GROSS!): these Python 2 names are referenced below
    long = int
    unicode = str
def is_class(dat):
    """Tell whether ``dat`` is itself a class object."""
    if not is_python3:
        # Python 2 distinguishes old-style (ClassType) and new-style
        # (TypeType) classes, so accept either one
        return type(dat) in (types.ClassType, types.TypeType)
    return isinstance(dat, type)
def is_instance(dat):
    """Tell whether ``dat`` is an instance of a class.

    On Python 3 this is True for any value that is neither a primitive
    (exact type listed in PRIMITIVE_TYPES) nor a class object.
    """
    if not is_python3:
        # ugh, classRE match is a bit of a hack :(
        return type(dat) == types.InstanceType or classRE.match(str(type(dat)))

    dat_type = type(dat)
    if dat_type in PRIMITIVE_TYPES:
        return False
    if not isinstance(dat_type, type):
        return False
    # classes are not reported as instances
    return not isinstance(dat, type)
def get_name(obj):
    """Return ``obj.__name__``, falling back to the name of its type."""
    named = obj
    # climb from the object to its type until something carries a __name__
    while not hasattr(named, "__name__"):
        named = type(named)
    return named.__name__
# Exact types that are treated as primitives. Membership is tested with
# `type(x) in PRIMITIVE_TYPES`, so instances of subclasses do not count.
# On Python 3, `long` and `unicode` are aliases of int and str.
PRIMITIVE_TYPES = (int, long, float, str, unicode, bool, type(None))
def encode_primitive(dat):
    """Return the JSON-friendly encoding of the primitive value ``dat``.

    Floats that JSON cannot carry faithfully (infinities, NaN) and floats
    with an integral value are wrapped as ``['SPECIAL_FLOAT', <text>]``;
    any other float is rounded to FLOAT_PRECISION decimal places. Python 2
    byte strings are decoded to unicode. Everything else is returned as-is.
    """
    kind = type(dat)

    if kind is float:
        if math.isnan(dat):
            return ["SPECIAL_FLOAT", "NaN"]
        if math.isinf(dat):
            return ["SPECIAL_FLOAT", "Infinity" if dat > 0 else "-Infinity"]
        if dat == int(dat):
            # render floats like 3.0 as '3.0' and not as 3
            return ["SPECIAL_FLOAT", "%.1f" % dat]
        return round(dat, FLOAT_PRECISION)

    if kind is str and not is_python3:
        # hack only for Python 2 strings ... always turn into unicode
        # and display '?' when it's not valid unicode
        return dat.decode("utf-8", "replace")

    # all other primitives pass through verbatim
    return dat
# grab a line number like ' <line 2>' or ' <line 2b>'
def create_lambda_line_number(codeobj, line_to_lambda_code):
    """Return a ' <line N>' suffix identifying the lambda ``codeobj``.

    ``line_to_lambda_code`` maps a line number to the list of lambda code
    objects defined on that line. The suffix is only produced when
    ``codeobj`` is present in the list for its own first line; on any
    ordinary failure (missing attribute, unknown line, unregistered code
    object) the empty string is returned instead.
    """
    try:
        lambda_lineno = codeobj.co_firstlineno
        # .index() raises ValueError when codeobj was never registered for
        # this line, which deliberately lands in the fallback below
        line_to_lambda_code[lambda_lineno].index(codeobj)
    except Exception:
        # best-effort: Exception (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate
        return ""

    # NOTE: a per-line letter suffix ('2a', '2b', ...) for several lambdas on
    # one line is intentionally not added, because their order isn't
    # guaranteed when you have multiple lambdas on the same line
    return " <line " + str(lambda_lineno) + ">"
# Note that this might BLOAT MEMORY CONSUMPTION since we're holding on
# to every reference ever created by the program without ever releasing
# anything!
class ObjectEncoder:
    """Encode run-time Python values into the JSON-friendly trace format.

    Unless ``render_heap_primitives`` is set, primitive values are returned
    inline. Every other value is stored once in ``encoded_heap_objects``
    under a small integer ID and referred to as ``['REF', <small ID>]``,
    which is how aliasing is preserved.

    NOTE: ``id_to_small_IDs`` is never cleared by any method of this class,
    so it grows with every distinct object that gets encoded.
    """

    # callable types that are reported as 'function' and are never treated
    # as ordinary user-visible instances
    _FUNCTION_TYPES = (
        types.FunctionType,
        types.MethodType,
        types.BuiltinFunctionType,
        types.BuiltinMethodType,
    )

    def __init__(self, parent):
        """Create an encoder bound to ``parent`` (should be a PGLogger object).

        ``parent`` must provide ``render_heap_primitives``,
        ``types_to_inline`` and ``should_hide_var()``.
        """
        self.parent = parent  # should be a PGLogger object

        # Key: canonicalized small ID
        # Value: encoded (compound) heap object
        self.encoded_heap_objects = {}

        self.render_heap_primitives = parent.render_heap_primitives

        # Key: id() of an encoded object
        # Value: the small ID handed out for it
        self.id_to_small_IDs = {}
        self.cur_small_ID = 1

        # wow, creating unique identifiers for lambdas is quite annoying,
        # especially if we want to properly differentiate:
        # 1.) multiple lambdas defined on the same line, and
        # 2.) the same lambda code defined multiple times on different lines
        #
        # However, it gets confused when there are multiple identical
        # lambdas on the same line, like:
        # f(lambda x:x*x, lambda y:y*y, lambda x:x*x)
        # (assumes everything is in one file)
        #
        # Key: line number
        # Value: list of the code objects of lambdas defined
        #        on that line in the order they were defined
        self.line_to_lambda_code = defaultdict(list)

    def should_hide_var(self, var):
        """Delegate to the parent's decision on whether ``var`` is hidden."""
        return self.parent.should_hide_var(var)

    @staticmethod
    def _instance_class_name(dat):
        """Return the class name to display for the instance ``dat``."""
        if hasattr(dat, "__class__"):
            # common case ...
            return get_name(dat.__class__)
        # super special case for something like
        # "from datetime import datetime_CAPI" in Python 3.2,
        # which is some weird 'PyCapsule' type ...
        # http://docs.python.org/release/3.1.5/c-api/capsule.html
        return get_name(type(dat))

    def should_inline_object_by_type(self, obj):
        """Return True if ``obj`` matches any of ``parent.types_to_inline``.

        Each entry of ``types_to_inline`` is called with the name returned
        by ``type(obj).__name__`` (or 'function' / 'instance'), and also
        with 'class' for classes or the class name for instances.
        """
        # fast-pass optimization -- common case
        if not self.parent.types_to_inline:
            return False

        typ = type(obj)
        typename = typ.__name__

        # pick up built-in functions too:
        if typ in self._FUNCTION_TYPES:
            typename = "function"

        if not typename:
            return False

        alt_typename = None
        if is_class(obj):
            alt_typename = "class"
        elif is_instance(obj) and typename != "function":
            # if obj is an instance of the Fooo class, then we want to match
            # on both 'instance' and 'Fooo'
            # (exception: 'function' objects are sometimes also instances,
            # but we still want to call them 'function', so ignore them)
            typename = "instance"
            alt_typename = self._instance_class_name(obj)

        for re_match in self.parent.types_to_inline:
            if re_match(typename):
                return True
            if alt_typename and re_match(alt_typename):
                return True
        return False

    def get_heap(self):
        """Return the current mapping of small IDs to encoded heap objects."""
        return self.encoded_heap_objects

    def reset_heap(self):
        """Start a fresh heap without disturbing previously returned ones."""
        # VERY IMPORTANT to reassign to an empty dict rather than just
        # clearing the existing dict, since get_heap() could have been
        # called earlier to return a reference to a previous heap state
        self.encoded_heap_objects = {}

    def set_function_parent_frame_ID(self, ref_obj, enclosing_frame_id):
        """Record ``enclosing_frame_id`` as the parent frame of a function.

        ``ref_obj`` must be a ``['REF', <small ID>]`` pointing at an already
        encoded ``['FUNCTION', name, parent frame ID, <optional defaults>]``.
        """
        assert ref_obj[0] == "REF"
        func_obj = self.encoded_heap_objects[ref_obj[1]]
        assert func_obj[0] == "FUNCTION"
        # the parent frame ID always sits at index 2; the LAST element may be
        # the optional list of default arguments, which must not be clobbered
        func_obj[2] = enclosing_frame_id

    def _is_externally_defined(self, dat):
        """Guess whether ``dat`` comes from external (i.e., non-user) code."""
        is_external = False
        try:
            # some objects don't return anything for getsourcefile() but DO
            # return something legit for getmodule(). e.g.,
            # "from io import StringIO" so TRY getmodule *first* and then
            # fall back on getsourcefile since getmodule seems more robust
            # empirically ...
            gsf = inspect.getmodule(dat).__file__
            if not gsf:
                gsf = inspect.getsourcefile(dat)

            # a hacky heuristic is that if gsf is an absolute path, then it's
            # likely to be some library function and *not* in user-defined
            # code
            #
            # NB: don't use os.path.isabs() since it doesn't work on some
            # python installations (e.g., on my webserver) and also adds a
            # dependency on the os module. just do a simple check:
            #
            # hacky: do other checks for strings that are indicative of files
            # that load user-written code, like 'generate_json_trace.py'
            if gsf and gsf[0] == "/" and "generate_json_trace.py" not in gsf:
                is_external = True
        except (AttributeError, TypeError):
            pass  # fail soft

        # if dat is a *real* object instance (and not some special built-in
        # one like ABCMeta, or a py3 function object), then DON'T treat it as
        # externally-defined because a user might be instantiating an
        # *instance* of an imported class in their own code, so we want to
        # show that instance in the visualization - ugh #hacky
        if (
            is_instance(dat)
            and type(dat) not in self._FUNCTION_TYPES
            and hasattr(dat, "__class__")
            and get_name(dat.__class__) != "ABCMeta"
        ):
            is_external = False

        return is_external

    # return either a primitive object or an object reference;
    # and as a side effect, update encoded_heap_objects
    def encode(self, dat, get_parent):
        """Encode a data value DAT using the GET_PARENT function for parent ids.

        Returns an inline primitive encoding, an
        ``['IMPORTED_FAUX_PRIMITIVE', <label>]`` placeholder, or a
        ``['REF', <small ID>]`` whose target is added to
        ``encoded_heap_objects`` as a side effect.

        ``get_parent``, when truthy, is called with a function object and
        its result is stored as that function's parent frame ID.
        """
        # primitive type
        if not self.render_heap_primitives and type(dat) in PRIMITIVE_TYPES:
            return encode_primitive(dat)

        # compound type - return an object reference and update
        # encoded_heap_objects

        # IMPORTED_FAUX_PRIMITIVE feature added on 2018-06-13:
        # if this is an externally-defined object (i.e., from an imported
        # module), don't try to recurse into it since we don't want to see
        # the internals of imported objects; just return an
        # IMPORTED_FAUX_PRIMITIVE object and continue along on our way
        if self._is_externally_defined(dat):
            label = "object"
            try:
                label = type(dat).__name__
                if is_class(dat):
                    label = "class"
                elif is_instance(dat):
                    label = "object"
            except Exception:
                pass  # best-effort label only
            return ["IMPORTED_FAUX_PRIMITIVE", "imported " + label]  # punt early!

        # next check whether it should be inlined
        if self.should_inline_object_by_type(dat):
            label = "object"
            try:
                label = type(dat).__name__
                if is_class(dat):
                    label = get_name(dat) + " class"
                elif is_instance(dat):
                    class_name = self._instance_class_name(dat)
                    if class_name:
                        label = class_name + " instance"
                    else:
                        label = "instance"
            except Exception:
                pass  # best-effort label only
            return ["IMPORTED_FAUX_PRIMITIVE", label + " (hidden)"]  # punt early!

        # look up (or hand out) the small ID for this object
        obj_id = id(dat)
        try:
            my_small_id = self.id_to_small_IDs[obj_id]
        except KeyError:
            my_small_id = self.cur_small_ID
            self.id_to_small_IDs[obj_id] = my_small_id
            self.cur_small_ID += 1

        ret = ["REF", my_small_id]

        # punt early if you've already encoded this object
        if my_small_id in self.encoded_heap_objects:
            return ret

        # major side-effect! register the (still empty) encoding BEFORE
        # recursing, so that self-referential structures terminate
        new_obj = []
        self.encoded_heap_objects[my_small_id] = new_obj

        typ = type(dat)

        if typ is list:
            new_obj.append("LIST")
            for e in dat:
                new_obj.append(self.encode(e, get_parent))
        elif typ is tuple:
            new_obj.append("TUPLE")
            for e in dat:
                new_obj.append(self.encode(e, get_parent))
        elif typ is set:
            new_obj.append("SET")
            for e in dat:
                new_obj.append(self.encode(e, get_parent))
        elif typ is dict:
            new_obj.append("DICT")
            for k, v in dat.items():
                # don't display some built-in locals ...
                if k not in ("__module__", "__return__", "__locals__"):
                    new_obj.append(
                        [self.encode(k, get_parent), self.encode(v, get_parent)]
                    )
        elif typ in (types.FunctionType, types.MethodType):
            self._encode_function(dat, new_obj, get_parent)
        elif typ is types.BuiltinFunctionType:
            pretty_name = get_name(dat) + "(...)"
            new_obj.extend(["FUNCTION", pretty_name, None])
        elif is_class(dat) or is_instance(dat):
            self.encode_class_or_instance(dat, new_obj)
        elif typ is types.ModuleType:
            new_obj.extend(["module", dat.__name__])
        elif typ in PRIMITIVE_TYPES:
            assert self.render_heap_primitives
            new_obj.extend(
                ["HEAP_PRIMITIVE", type(dat).__name__, encode_primitive(dat)]
            )
        else:
            # anything else: [<type name>, string representation]
            typeStr = str(typ)
            m = typeRE.match(typeStr)
            if not m:
                m = classRE.match(typeStr)
            assert m, typ

            if is_python3:
                encoded_dat = str(dat)
            else:
                # ugh, for bytearray() in Python 2, str() returns
                # non-JSON-serializable characters, so need to decode:
                encoded_dat = str(dat).decode("utf-8", "replace")
            new_obj.extend([m.group(1), encoded_dat])

        return ret

    def _encode_function(self, dat, new_obj, get_parent):
        """Fill ``new_obj`` with the 'FUNCTION' encoding of ``dat``.

        The result is ``['FUNCTION', pretty name, parent frame ID]`` plus,
        only when there are any, a final list of (argument name, encoded
        default value) pairs.
        """
        if is_python3:
            argspec = inspect.getfullargspec(dat)
        else:
            argspec = inspect.getargspec(dat)

        printed_args = list(argspec.args)
        default_arg_names_and_vals = []

        if argspec.defaults:
            num_missing_defaults = len(printed_args) - len(argspec.defaults)
            assert num_missing_defaults >= 0
            # tricky tricky tricky how default positional arguments work!
            # defaults line up with the LAST len(defaults) positional args
            for arg_name, default_val in zip(
                printed_args[num_missing_defaults:], argspec.defaults
            ):
                default_arg_names_and_vals.append(
                    (arg_name, self.encode(default_val, get_parent))
                )

        if argspec.varargs:
            printed_args.append("*" + argspec.varargs)

        if is_python3:
            # kwonlyargs come before varkw
            if argspec.kwonlyargs:
                printed_args.extend(argspec.kwonlyargs)
                if argspec.kwonlydefaults:
                    # iterate in order of appearance in kwonlyargs
                    for varname in argspec.kwonlyargs:
                        if varname in argspec.kwonlydefaults:
                            val = argspec.kwonlydefaults[varname]
                            default_arg_names_and_vals.append(
                                (varname, self.encode(val, get_parent))
                            )
            if argspec.varkw:
                printed_args.append("**" + argspec.varkw)
        else:
            if argspec.keywords:
                printed_args.append("**" + argspec.keywords)

        func_name = get_name(dat)
        pretty_name = func_name

        # sometimes might fail for, say, <genexpr>, so just ignore
        # failures for now ...
        try:
            pretty_name += "(" + ", ".join(printed_args) + ")"
        except TypeError:
            pass

        # put a line number suffix on lambdas to more uniquely identify
        # them, since they don't have names
        if func_name == "<lambda>":
            cod = dat.__code__ if is_python3 else dat.func_code  # ugh!
            lst = self.line_to_lambda_code[cod.co_firstlineno]
            if cod not in lst:
                lst.append(cod)
            pretty_name += create_lambda_line_number(cod, self.line_to_lambda_code)

        encoded_val = ["FUNCTION", pretty_name, None]
        if get_parent:
            encoded_val[2] = get_parent(dat)
        new_obj.extend(encoded_val)

        # OPTIONAL!!!
        if default_arg_names_and_vals:
            # *append* it as a single list element
            new_obj.append(default_arg_names_and_vals)

    def encode_class_or_instance(self, dat, new_obj):
        """Encode dat as a class or instance.

        Fills ``new_obj`` with an 'INSTANCE', 'INSTANCE_PPRINT' or 'CLASS'
        header followed by one ``[encoded name, encoded value]`` pair per
        visible attribute found in ``dat.__dict__``.
        """
        if is_instance(dat):
            class_name = self._instance_class_name(dat)

            pprint_str = None
            # do you or any of your superclasses have a __str__ field?
            # if so, pretty-print yourself!
            if hasattr(dat, "__str__"):
                try:
                    pprint_str = dat.__str__()

                    # sometimes you'll get 'trivial' pprint_str like:
                    # '<__main__.MyObj object at 0x10f465cd0>'
                    # or '<module 'collections' ...'
                    # IGNORE THOSE!!!
                    if (
                        pprint_str[0] == "<"
                        and pprint_str[-1] == ">"
                        and (" at " in pprint_str or pprint_str.startswith("<module"))
                    ):
                        pprint_str = None
                except Exception:
                    # a failing or odd user-defined __str__ must not abort
                    # the encoding
                    pass

            # TODO: filter for trivial-looking pprint_str like those produced
            # by object.__str__
            if pprint_str:
                new_obj.extend(["INSTANCE_PPRINT", class_name, pprint_str])
            else:
                new_obj.extend(["INSTANCE", class_name])

            # don't traverse inside modules, or else risk EXPLODING the
            # visualization
            if class_name == "module":
                return
        else:
            superclass_names = [e.__name__ for e in dat.__bases__ if e is not object]
            new_obj.extend(["CLASS", get_name(dat), superclass_names])

        # traverse inside of its __dict__ to grab attributes
        # (filter out useless-seeming ones, based on anecdotal observation):
        hidden = (
            "__doc__",
            "__module__",
            "__return__",
            "__dict__",
            "__locals__",
            "__weakref__",
            "__qualname__",
        )

        if hasattr(dat, "__dict__"):
            user_attrs = sorted([e for e in dat.__dict__ if e not in hidden])
        else:
            user_attrs = []

        for attr in user_attrs:
            if not self.should_hide_var(attr):
                new_obj.append(
                    [self.encode(attr, None), self.encode(dat.__dict__[attr], None)]
                )