# codeapi-new/pg_encoder.py

# Online Python Tutor
# https://github.com/pgbovine/OnlinePythonTutor/
#
# Copyright (C) Philip J. Guo (philip@pgbovine.net)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Thanks to John DeNero for making the encoder work on both Python 2 and 3
# (circa 2012-2013)


# Given an arbitrary piece of Python data, encode it in such a manner
# that it can be later encoded into JSON.
#   http://json.org/
#
# We use this function to encode run-time traces of data structures
# to send to the front-end.
#
# Format:
#   Primitives:
#   * None, int, long, float, str, bool - unchanged
#     (json.dumps encodes these fine verbatim, except for inf, -inf, and nan)
#
#   exceptions: float('inf')  -> ['SPECIAL_FLOAT', 'Infinity']
#               float('-inf') -> ['SPECIAL_FLOAT', '-Infinity']
#               float('nan')  -> ['SPECIAL_FLOAT', 'NaN']
#               x == int(x)   -> ['SPECIAL_FLOAT', '%.1f' % x]
#               (this way, 3.0 prints as '3.0' and not as 3, which looks like an int)
#
#   If render_heap_primitives is True, then primitive values are rendered
#   on the heap as ['HEAP_PRIMITIVE', <type name>, <value>]
#
#   (for SPECIAL_FLOAT values, <value> is a list like ['SPECIAL_FLOAT', 'Infinity'])
#
#   added on 2018-06-13:
#   ['IMPORTED_FAUX_PRIMITIVE', <label>] - renders externally imported objects
#                                          like they were primitives, to save
#                                          space and to avoid having to
#                                          recurse into them to see internals
#
#   Compound objects:
#   * list     - ['LIST', elt1, elt2, elt3, ..., eltN]
#   * tuple    - ['TUPLE', elt1, elt2, elt3, ..., eltN]
#   * set      - ['SET', elt1, elt2, elt3, ..., eltN]
#   * dict     - ['DICT', [key1, value1], [key2, value2], ..., [keyN, valueN]]
#   * instance - ['INSTANCE', class name, [attr1, value1], [attr2, value2], ..., [attrN, valueN]]
#   * instance with non-trivial __str__ defined - ['INSTANCE_PPRINT', class name, <__str__ value>, [attr1, value1], [attr2, value2], ..., [attrN, valueN]]
#   * class    - ['CLASS', class name, [list of superclass names], [attr1, value1], [attr2, value2], ..., [attrN, valueN]]
#   * function - ['FUNCTION', function name, parent frame ID (for nested functions),
#                 [*OPTIONAL* list of pairs of default argument names/values] ] <-- final optional element added on 2018-06-13
#   * module   - ['module', module name]
#   * other    - [<type name>, string representation of object]
#   * compound object reference - ['REF', target object's unique_id]
#
# the unique_id is derived from id(), which allows us to capture aliasing


# number of decimal places that non-integral floats are rounded to
FLOAT_PRECISION = 4


from collections import defaultdict
import re, types
import sys
import math

# Patterns for pulling the bare type name out of str(type(x)), which looks
# like "<type 'foo'>" or "<class 'foo'>"
typeRE = re.compile("<type '(.*)'>")
classRE = re.compile("<class '(.*)'>")

import inspect

# TODO: maybe use the 'six' library to smooth over Py2 and Py3 incompatibilities?
# True when running under a Python 3 interpreter
is_python3 = sys.version_info[0] == 3
if is_python3:
    # avoid name errors (GROSS!): alias the Python 2 names `long` and
    # `unicode` to int and str so the rest of this module can refer to
    # them unconditionally
    long = int
    unicode = str


def is_class(dat):
    """Tell whether dat is itself a class object rather than an instance."""
    if not is_python3:
        # Python 2: accept both old-style (ClassType) and new-style (TypeType)
        # classes, matching on the exact type of dat
        return type(dat) in (types.ClassType, types.TypeType)
    # Python 3: every class is an instance of type (or of a metaclass)
    return isinstance(dat, type)


def is_instance(dat):
    """Tell whether dat is an instance of a class.

    On Python 3 this is a strict bool: dat must not be a primitive and must
    not itself be a class. On Python 2 the result is only truthy/falsy (it
    may be a regex match object or None).
    """
    if not is_python3:
        # ugh, classRE match is a bit of a hack :(
        return type(dat) == types.InstanceType or classRE.match(str(type(dat)))

    dat_type = type(dat)
    if dat_type in PRIMITIVE_TYPES:
        return False
    if not isinstance(dat_type, type):
        return False
    # classes themselves are not counted as instances
    return not isinstance(dat, type)


def get_name(obj):
    """Return obj.__name__, falling back to the name of obj's type."""
    target = obj
    # climb from the object to its type until something carries a __name__
    while not hasattr(target, "__name__"):
        target = type(target)
    return target.__name__


# Exact types that are treated as primitives. Membership is tested with
# `type(x) in PRIMITIVE_TYPES`, so subclasses of these types do not count.
PRIMITIVE_TYPES = (int, long, float, str, unicode, bool, type(None))


def encode_primitive(dat):
    """Encode a primitive value into a JSON-friendly form.

    Floats get special treatment: infinities and NaN become
    ['SPECIAL_FLOAT', <label>], integral floats become
    ['SPECIAL_FLOAT', '<value with one decimal>'] so that 3.0 does not look
    like the int 3, and all other floats are rounded to FLOAT_PRECISION
    decimal places. Python 2 byte strings are decoded as UTF-8 (invalid
    bytes are replaced). Everything else is returned unchanged.
    """
    kind = type(dat)

    if kind is not float:
        if kind is str and (not is_python3):
            # hack only for Python 2 strings ... always turn into unicode
            # and display '?' when it's not valid unicode
            return dat.decode("utf-8", "replace")
        # all remaining primitives pass through verbatim
        return dat

    if math.isnan(dat):
        return ["SPECIAL_FLOAT", "NaN"]

    if math.isinf(dat):
        label = "Infinity" if dat > 0 else "-Infinity"
        return ["SPECIAL_FLOAT", label]

    # render floats like 3.0 as '3.0' and not as 3
    if dat == int(dat):
        return ["SPECIAL_FLOAT", "%.1f" % dat]

    return round(dat, FLOAT_PRECISION)


# grab a line number like ' <line 2>' or ' <line 2b>'
def create_lambda_line_number(codeobj, line_to_lambda_code):
    """Return a ' <line N>' suffix for a lambda's code object, or ''.

    Args:
        codeobj: the lambda's code object; its co_firstlineno is used as N.
        line_to_lambda_code: mapping from line number to the list of lambda
            code objects registered for that line.

    Returns:
        ' <line N>' when codeobj is registered under its own line number,
        otherwise '' (this is best-effort: any ordinary error while looking
        the code object up also yields '').
    """
    try:
        lambda_lineno = codeobj.co_firstlineno
        # Used purely as a membership check: .index() raises ValueError when
        # codeobj has not been registered for this line.
        #
        # No per-lambda suffix (e.g. '2b') is added for additional lambdas on
        # the same line, because their order isn't guaranteed when several
        # lambdas share a line.
        line_to_lambda_code[lambda_lineno].index(codeobj)
        return " <line " + str(lambda_lineno) + ">"
    except Exception:
        # fail soft, but let KeyboardInterrupt / SystemExit propagate
        return ""


# Note that this might BLOAT MEMORY CONSUMPTION since we're holding on
# to every reference ever created by the program without ever releasing
# anything!
class ObjectEncoder:
    def __init__(self, parent):
        """Create an encoder with an empty heap, bound to PARENT.

        PARENT should be a PGLogger object; its render_heap_primitives
        setting is copied onto the encoder.
        """
        self.parent = parent
        self.render_heap_primitives = parent.render_heap_primitives

        # Compact-ID bookkeeping: id(obj) -> small ID, plus the next small ID
        # that will be handed out.
        self.cur_small_ID = 1
        self.id_to_small_IDs = dict()

        # Heap contents.
        # Key: canonicalized small ID
        # Value: encoded (compound) heap object
        self.encoded_heap_objects = dict()

        # Lambdas have no names, so they are labelled by the line they were
        # defined on. Doing that properly means telling apart:
        # 1.) multiple lambdas defined on the same line, and
        # 2.) the same lambda code defined multiple times on different lines
        #
        # It still gets confused by several identical lambdas on one line:
        # f(lambda x:x*x, lambda y:y*y, lambda x:x*x)
        #
        # (assumes everything is in one file)
        # Key:   line number
        # Value: list of the code objects of lambdas defined
        #        on that line in the order they were defined
        self.line_to_lambda_code = defaultdict(list)

    def should_hide_var(self, var):
        """Return the parent's verdict on whether VAR should be hidden."""
        return self.parent.should_hide_var(var)

    # searches through self.parent.types_to_inline and tries
    # to match the type returned by type(obj).__name__ and
    # also 'class' and 'instance' for classes and instances, respectively
    def should_inline_object_by_type(self, obj):
        """Return True if OBJ's type matches any of the parent's inline matchers.

        Each entry of self.parent.types_to_inline is called with a candidate
        type name and is expected to return a truthy value on a match. The
        candidates are a primary name ('function', 'instance', or
        type(obj).__name__) and, where applicable, an alternate name ('class'
        for classes, or the class name for instances).
        """
        matchers = self.parent.types_to_inline
        # fast-pass optimization -- common case
        if not matchers:
            return False

        obj_type = type(obj)
        function_types = (
            types.FunctionType,
            types.MethodType,
            types.BuiltinFunctionType,
            types.BuiltinMethodType,
        )
        # pick up built-in functions too:
        if obj_type in function_types:
            typename = "function"
        else:
            typename = obj_type.__name__

        if not typename:
            return False

        alt_typename = None
        if is_class(obj):
            alt_typename = "class"
        elif is_instance(obj) and typename != "function":
            # an instance of the Fooo class should match on both 'instance'
            # and 'Fooo' (functions are sometimes also instances, but they
            # keep the name 'function', hence the exclusion above)
            typename = "instance"
            if hasattr(obj, "__class__"):
                # common case ...
                alt_typename = get_name(obj.__class__)
            else:
                # super special case for something like
                # "from datetime import datetime_CAPI" in Python 3.2,
                # which is some weird 'PyCapsule' type ...
                # http://docs.python.org/release/3.1.5/c-api/capsule.html
                alt_typename = get_name(type(obj))

        candidates = [typename]
        if alt_typename:
            candidates.append(alt_typename)

        for matcher in matchers:
            for candidate in candidates:
                if matcher(candidate):
                    return True
        return False

    def get_heap(self):
        """Return the current heap dict (small ID -> encoded object).

        The dict itself is returned, not a copy.
        """
        return self.encoded_heap_objects

    def reset_heap(self):
        """Start a fresh, empty heap without touching previously returned ones."""
        # VERY IMPORTANT to reassign to an empty dict rather than just
        # clearing the existing dict, since get_heap() could have been
        # called earlier to return a reference to a previous heap state
        self.encoded_heap_objects = {}

    def set_function_parent_frame_ID(self, ref_obj, enclosing_frame_id):
        """Record ENCLOSING_FRAME_ID as the parent frame of an encoded function.

        Args:
            ref_obj: a ['REF', small_id] pair whose heap entry is a FUNCTION.
            enclosing_frame_id: the value to store as the parent frame ID.

        Raises:
            AssertionError: if ref_obj is not a REF or its heap entry is not
                a FUNCTION.
            KeyError: if the referenced small ID is not in the current heap.
        """
        assert ref_obj[0] == "REF"
        func_obj = self.encoded_heap_objects[ref_obj[1]]
        assert func_obj[0] == "FUNCTION"
        # A FUNCTION entry is ['FUNCTION', name, parent frame ID] optionally
        # followed by a list of default arguments, so the parent frame ID is
        # always at index 2 but is NOT always the last element.
        func_obj[2] = enclosing_frame_id

    # return either a primitive object or an object reference;
    # and as a side effect, update encoded_heap_objects
    def encode(self, dat, get_parent):
        """Encode a data value DAT using the GET_PARENT function for parent ids.

        Args:
            dat: the Python value to encode.
            get_parent: a callable, or a falsy value. When truthy it is
                called as get_parent(func) for Python-level functions and
                methods, and its result is stored in the parent-frame-ID slot
                of the FUNCTION entry.

        Returns:
            - encode_primitive(dat) when dat's type is in PRIMITIVE_TYPES and
              render_heap_primitives is off;
            - ['IMPORTED_FAUX_PRIMITIVE', <label>] when dat is judged to be
              externally defined, or when its type is configured to be
              inlined;
            - otherwise ['REF', <small id>], with the encoded object stored
              in self.encoded_heap_objects under that small id as a side
              effect.

        Raises:
            AssertionError: from the internal sanity checks below, e.g. when
                the fallback branch cannot extract a type name from
                str(type(dat)).
        """
        # primitive type
        if not self.render_heap_primitives and type(dat) in PRIMITIVE_TYPES:
            return encode_primitive(dat)
        # compound type - return an object reference and update encoded_heap_objects
        else:
            # IMPORTED_FAUX_PRIMITIVE feature added on 2018-06-13:
            is_externally_defined = (
                False  # is dat defined in external (i.e., non-user) code?
            )
            try:
                # some objects don't return anything for getsourcefile() but DO return
                # something legit for getmodule(). e.g., "from io import StringIO"
                # so TRY getmodule *first* and then fall back on getsourcefile
                # since getmodule seems more robust empirically ...
                gsf = inspect.getmodule(dat).__file__
                if not gsf:
                    gsf = inspect.getsourcefile(dat)

                # a hacky heuristic is that if gsf is an absolute path, then it's likely
                # to be some library function and *not* in user-defined code
                #
                # NB: don't use os.path.isabs() since it doesn't work on some
                # python installations (e.g., on my webserver) and also adds a
                # dependency on the os module. just do a simple check:
                #
                # hacky: do other checks for strings that are indicative of files
                # that load user-written code, like 'generate_json_trace.py'
                if gsf and gsf[0] == "/" and "generate_json_trace.py" not in gsf:
                    is_externally_defined = True
            except (AttributeError, TypeError):
                pass  # fail soft
            my_id = id(dat)

            # if dat is a *real* object instance (and not some special built-in one
            # like ABCMeta, or a py3 function object), then DON'T treat it as
            # externally-defined because a user might be instantiating an *instance*
            # of an imported class in their own code, so we want to show that instance
            # in da visualization - ugh #hacky
            if (
                is_instance(dat)
                and type(dat)
                not in (
                    types.FunctionType,
                    types.MethodType,
                    types.BuiltinFunctionType,
                    types.BuiltinMethodType,
                )
                and hasattr(dat, "__class__")
                and (get_name(dat.__class__) != "ABCMeta")
            ):
                is_externally_defined = False

            # if this is an externally-defined object (i.e., from an imported
            # module), don't try to recurse into it since we don't want to see
            # the internals of imported objects; just return an
            # IMPORTED_FAUX_PRIMITIVE object and continue along on our way
            if is_externally_defined:
                label = "object"
                try:
                    label = type(dat).__name__
                    if is_class(dat):
                        label = "class"
                    elif is_instance(dat):
                        label = "object"
                except:
                    pass
                return ["IMPORTED_FAUX_PRIMITIVE", "imported " + label]  # punt early!

            # next check whether it should be inlined
            if self.should_inline_object_by_type(dat):
                label = "object"
                try:
                    label = type(dat).__name__
                    if is_class(dat):
                        class_name = get_name(dat)
                        label = class_name + " class"
                    elif is_instance(dat):
                        # a lot of copy-pasta from other parts of this file:
                        # TODO: clean up
                        class_name = None
                        if hasattr(dat, "__class__"):
                            # common case ...
                            class_name = get_name(dat.__class__)
                        else:
                            # super special case for something like
                            # "from datetime import datetime_CAPI" in Python 3.2,
                            # which is some weird 'PyCapsule' type ...
                            # http://docs.python.org/release/3.1.5/c-api/capsule.html
                            class_name = get_name(type(dat))
                        if class_name:
                            label = class_name + " instance"
                        else:
                            label = "instance"
                except:
                    pass
                return ["IMPORTED_FAUX_PRIMITIVE", label + " (hidden)"]  # punt early!

            # map id(dat) to a compact small ID, allocating the next one the
            # first time this id is seen
            try:
                my_small_id = self.id_to_small_IDs[my_id]
            except KeyError:
                my_small_id = self.cur_small_ID
                self.id_to_small_IDs[my_id] = self.cur_small_ID
                self.cur_small_ID += 1

            del my_id  # to prevent bugs later in this function

            ret = ["REF", my_small_id]

            # punt early if you've already encoded this object
            if my_small_id in self.encoded_heap_objects:
                return ret

            # major side-effect!
            # the (still empty) entry is registered BEFORE any recursive
            # encode() call below, so a structure that contains itself hits
            # the "already encoded" check above instead of recursing forever
            new_obj = []
            self.encoded_heap_objects[my_small_id] = new_obj

            typ = type(dat)

            if typ == list:
                new_obj.append("LIST")
                for e in dat:
                    new_obj.append(self.encode(e, get_parent))
            elif typ == tuple:
                new_obj.append("TUPLE")
                for e in dat:
                    new_obj.append(self.encode(e, get_parent))
            elif typ == set:
                new_obj.append("SET")
                for e in dat:
                    new_obj.append(self.encode(e, get_parent))
            elif typ == dict:
                new_obj.append("DICT")
                for k, v in dat.items():
                    # don't display some built-in locals ...
                    if k not in ("__module__", "__return__", "__locals__"):
                        new_obj.append(
                            [self.encode(k, get_parent), self.encode(v, get_parent)]
                        )
            elif typ in (types.FunctionType, types.MethodType):
                if is_python3:
                    argspec = inspect.getfullargspec(dat)
                else:
                    argspec = inspect.getargspec(dat)

                printed_args = [e for e in argspec.args]

                default_arg_names_and_vals = []
                if argspec.defaults:
                    num_missing_defaults = len(printed_args) - len(argspec.defaults)
                    assert num_missing_defaults >= 0
                    # tricky tricky tricky how default positional arguments work!
                    # (defaults line up with the LAST len(defaults) positional args)
                    for i in range(num_missing_defaults, len(printed_args)):
                        default_arg_names_and_vals.append(
                            (
                                printed_args[i],
                                self.encode(
                                    argspec.defaults[i - num_missing_defaults],
                                    get_parent,
                                ),
                            )
                        )

                if argspec.varargs:
                    printed_args.append("*" + argspec.varargs)

                if is_python3:
                    # kwonlyargs come before varkw
                    if argspec.kwonlyargs:
                        printed_args.extend(argspec.kwonlyargs)
                        if argspec.kwonlydefaults:
                            # iterate in order of appearance in kwonlyargs
                            for varname in argspec.kwonlyargs:
                                if varname in argspec.kwonlydefaults:
                                    val = argspec.kwonlydefaults[varname]
                                    default_arg_names_and_vals.append(
                                        (varname, self.encode(val, get_parent))
                                    )
                    if argspec.varkw:
                        printed_args.append("**" + argspec.varkw)
                else:
                    if argspec.keywords:
                        printed_args.append("**" + argspec.keywords)

                func_name = get_name(dat)

                pretty_name = func_name

                # sometimes might fail for, say, <genexpr>, so just ignore
                # failures for now ...
                try:
                    pretty_name += "(" + ", ".join(printed_args) + ")"
                except TypeError:
                    pass

                # put a line number suffix on lambdas to more uniquely identify
                # them, since they don't have names
                if func_name == "<lambda>":
                    cod = dat.__code__ if is_python3 else dat.func_code  # ugh!
                    # register this lambda's code object under its line
                    # (once) so that the suffix lookup below can find it
                    lst = self.line_to_lambda_code[cod.co_firstlineno]
                    if cod not in lst:
                        lst.append(cod)
                    pretty_name += create_lambda_line_number(
                        cod, self.line_to_lambda_code
                    )

                # slot 2 is the parent frame ID; it stays None unless
                # get_parent was supplied
                encoded_val = ["FUNCTION", pretty_name, None]
                if get_parent:
                    enclosing_frame_id = get_parent(dat)
                    encoded_val[2] = enclosing_frame_id
                new_obj.extend(encoded_val)
                # OPTIONAL!!!
                if default_arg_names_and_vals:
                    new_obj.append(
                        default_arg_names_and_vals
                    )  # *append* it as a single list element

            elif typ is types.BuiltinFunctionType:
                pretty_name = get_name(dat) + "(...)"
                new_obj.extend(["FUNCTION", pretty_name, None])
            elif is_class(dat) or is_instance(dat):
                self.encode_class_or_instance(dat, new_obj)
            elif typ is types.ModuleType:
                new_obj.extend(["module", dat.__name__])
            elif typ in PRIMITIVE_TYPES:
                # primitives only get this far when they are being rendered
                # on the heap (see the check at the top of this method)
                assert self.render_heap_primitives
                new_obj.extend(
                    ["HEAP_PRIMITIVE", type(dat).__name__, encode_primitive(dat)]
                )
            else:
                # fallback: [<type name>, string representation of object],
                # with the type name parsed out of str(type(dat))
                typeStr = str(typ)
                m = typeRE.match(typeStr)

                if not m:
                    m = classRE.match(typeStr)

                assert m, typ

                if is_python3:
                    encoded_dat = str(dat)
                else:
                    # ugh, for bytearray() in Python 2, str() returns
                    # non-JSON-serializable characters, so need to decode:
                    encoded_dat = str(dat).decode("utf-8", "replace")
                new_obj.extend([m.group(1), encoded_dat])

            return ret

    def encode_class_or_instance(self, dat, new_obj):
        """Fill NEW_OBJ with the encoding of DAT, a class or an instance.

        Instances become ['INSTANCE', class name, ...] or, when __str__ gives
        a non-trivial string, ['INSTANCE_PPRINT', class name, <str>, ...].
        Classes become ['CLASS', class name, [superclass names], ...]. In both
        cases the header is followed by one [attr, value] pair per visible
        entry of dat.__dict__, in sorted attribute order. Module instances
        get the header only.
        """
        if not is_instance(dat):
            base_names = []
            for base in dat.__bases__:
                if base is not object:
                    base_names.append(base.__name__)
            new_obj.extend(["CLASS", get_name(dat), base_names])
        else:
            if hasattr(dat, "__class__"):
                # common case ...
                owner = dat.__class__
            else:
                # super special case for something like
                # "from datetime import datetime_CAPI" in Python 3.2,
                # which is some weird 'PyCapsule' type ...
                # http://docs.python.org/release/3.1.5/c-api/capsule.html
                owner = type(dat)
            class_name = get_name(owner)

            # do you or any of your superclasses have a __str__ field?
            # if so, pretty-print yourself!
            pprint_str = None
            if hasattr(dat, "__str__"):
                try:
                    pprint_str = dat.__str__()

                    # 'trivial' results such as
                    # '<__main__.MyObj object at 0x10f465cd0>' or
                    # '<module 'collections' ...' are ignored
                    bracketed = pprint_str[0] == "<" and pprint_str[-1] == ">"
                    if bracketed and (
                        " at " in pprint_str or pprint_str.startswith("<module")
                    ):
                        pprint_str = None
                except:
                    pass

            # TODO: filter for trivial-looking pprint_str like those produced
            # by object.__str__
            if pprint_str:
                header = ["INSTANCE_PPRINT", class_name, pprint_str]
            else:
                header = ["INSTANCE", class_name]
            new_obj.extend(header)

            # don't traverse inside modules, or else risk EXPLODING the visualization
            if class_name == "module":
                return

        # nothing more to add when there is no __dict__ to traverse
        if not hasattr(dat, "__dict__"):
            return

        # attributes filtered out as useless-seeming, based on anecdotal
        # observation
        hidden = (
            "__doc__",
            "__module__",
            "__return__",
            "__dict__",
            "__locals__",
            "__weakref__",
            "__qualname__",
        )
        visible_attrs = [name for name in dat.__dict__ if name not in hidden]
        visible_attrs.sort()

        for name in visible_attrs:
            if self.should_hide_var(name):
                continue
            encoded_name = self.encode(name, None)
            encoded_value = self.encode(dat.__dict__[name], None)
            new_obj.append([encoded_name, encoded_value])