Goose  Artifact [f1cd855a22]

Artifact f1cd855a22b47adb74006c98dfeb036df127c1ec775888627f0ca91993d7f76d:

  • File bs/codegen/mangle.cpp — part of check-in [967d3ba3d7] at 2021-09-16 19:00:02 on branch trunk — More work on the g0 EIR api (user: achavasse size: 8317)

#include "codegen.h"

using namespace goose;
using namespace goose::eir;

// Mangling scheme:
//
// Functions, types and everything are uniquely identified by IR expressions
// (built out of eir::Term objects) called Identities.
// The mangling scheme is therefore a lossless text serialization of such an identity,
// with a simple compression scheme to make it shorter.
//
// All integers serialized as part of this scheme are in hex.
//
// Each element is introduced by one character. Hex digits (0-9, a-f) can't be used as
// intro characters.
//
// Literal elements:
//   i: integer, followed by its value
//   S: string, followed by its length, followed by ':', followed by the string itself.
//   n: anonymous StringId, followed by its unique id
//   s: StringId, followed by its length, followed by ':', followed by the string itself.
//   _ followed by an hex number: vector, whose length is that number,
//     followed by its contained elements, serialized recursively.
//   @ same as above, but the vector is of variable length and is followed by the repetition element.
//   p, P: a void* or ptr< void > term (unfortunately they have legitimate uses in some type identitiers
//      that we do need to be able to mangle)
//   x: an eir term placeholder.
//
// $: compressed string id: it is followed by the 0 based index of a previously encountered
//   StringId literal, in the order they were encountered.
//
// The letters V, o, v, t, T, q, u, r, R, n, l:
//   respectively the StringIDs: Value, Constant, void, type, rt_type,
//   quote, func, param, predicates, anykind, bool.
//   (those are super recurrent StringIDs in every function identity, so they have dedicated
//   symbols to make things shorter. Note that none of those can be the letter a to f)
//
// Some types of Terms (wildcards such as AnyTerm, opaque pointer to c++ objects) are not meant
// to appear in an identity expression and will cause mangling to fail.

// TODO: this mangling scheme is ok at this stage of development as it does perform the only thing
// we really need at the moment, which is to make sure that function with distinct identities have a
// distinct mangling.
// However too much unrelated stuff ends up in the mangling, which is bad. It tends to expose
// the internals of type implementation a bit too much - changes that should be irrelevant to binary
// compatibility might break it.

// This should be replaced with a pattern based rule system, much like the type checking rules,
// so that we can still have this as default but a specific handling of higher level constructs
// such as values (to remove the location id), function types (to get rid of the param pattern),
// etc.

// Alternatively maybe we should be more clever when building identities and avoid including those
// unecessary parts. This would require more intelligence to build an identity, although it might
// not necessarily be more complicated than the rule system idea above.

namespace
{
    class Mangler
    {
        public:
            Mangler()
            {
                m_mangled << hex;
            }

            bool mangle( const Term& t )
            {
                return visit( [&]( auto&& t )
                {
                    return mangle( t );
                }, t );
            }

            bool mangle( uint32_t integer )
            {
                m_mangled << 'i' << integer;
                return true;
            }

            // Locations shouldn't be included in the mangling, but
            // should not cause it to fail either.
            bool mangle( LocationId loc )
            {
                return true;
            }

            bool mangle( const string& str )
            {
                m_mangled << 'S' << str.size() << ':' << str;
                return true;
            }

            bool mangle( const ptr< void >& pv )
            {
                m_mangled << 'P';
                return true;
            }

            bool mangle( void* pv )
            {
                m_mangled << 'p';
                return true;
            }

            bool mangle( StringId strId )
            {
                // Some hard coded abbreviations for super recurrent stuff that ends up being present every time.
                // Not strictly necessary, but makes for shorter symbols.
                if( strId == "value"_sid )
                {
                    m_mangled << 'V';
                    return true;
                }

                if( strId == "constant"_sid )
                {
                    // Use o because we can't use a letter that also counts as an hex digit
                    // if we want to be able to unmangle it
                    m_mangled << 'o';
                    return true;
                }

                if( strId == "void"_sid )
                {
                    m_mangled << 'v';
                    return true;
                }

                if( strId == "type"_sid )
                {
                    m_mangled << 't';
                    return true;
                }

                if( strId == "rt_type"_sid )
                {
                    m_mangled << 'T';
                    return true;
                }

                if( strId == "quote"_sid )
                {
                    m_mangled << 'q';
                    return true;
                }

                if( strId == "func"_sid )
                {
                    m_mangled << 'u';
                    return true;
                }

                if( strId == "param"_sid )
                {
                    m_mangled << 'r';
                    return true;
                }

                if( strId == "predicates"_sid )
                {
                    m_mangled << 'R';
                    return true;
                }

                if( strId == "anykind"_sid )
                {
                    m_mangled << 'n';
                    return true;
                }

                if( strId == "bool"_sid )
                {
                    m_mangled << 'l';
                    return true;
                }

                // Generic compression scheme to avoid storing the same stringId
                // literals multiple times.
                if( auto it = m_dictionary.find( strId ); it != m_dictionary.end() )
                {
                    m_mangled << '$' << it->second;
                    return true;
                }

                m_dictionary.emplace( strId, m_dictionary.size() );
                m_mangled << 's' << strId.str().size();

                if( strId.isNumerical() )
                    m_mangled << '#' << strId.id() << '#';
                else
                    m_mangled << ':' << strId.str();

                return true;
            }

            bool mangle( const pvec& vec )
            {
                if( vec->repetitionTerm() )
                    m_mangled << '@';
                else
                    m_mangled << '_';

                m_mangled << vec->terms().size();

                for( auto&& t : vec->terms() )
                {
                    if( !mangle( t ) )
                        return false;
                }

                if( vec->repetitionTerm() && !mangle( *vec->repetitionTerm() ) )
                    return false;

                return true;
            }

            bool mangle( const AnyTerm& a )
            {
                m_mangled << 'x';
                return true;
            }

            bool mangle( const Hole& h )
            {
                m_mangled << 'h';
                mangle( h.name() );
                mangle( h.kind() );
                return true;
            }

            template< typename T >
            bool mangle( const T& )
            {
                return false;
            }

            auto mangled() const
            {
                return m_mangled.str();
            }

        private:
            stringstream m_mangled;
            unordered_map< StringId, size_t > m_dictionary;
    };
}

namespace goose::codegen
{
    optional< string > Mangle( const Term& identity )
    {
        Mangler m;
        if( !m.mangle( identity ) )
            return nullopt;

        return m.mangled();
    }
}