post-scarcity/src/io/read.c

/*
 * read.c
 *
 * First pass at a reader, for bootstrapping.
 *
 *
 * (c) 2017 Simon Brooke <simon@journeyman.cc>
 * Licensed under GPL version 2.0, or, at your option, any later version.
 */

#include <math.h>
#include <stdbool.h>
#include <stdio.h>
/*
 * wide characters
 */
#include <wchar.h>
#include <wctype.h>

#include "memory/consspaceobject.h"
#include "debug.h"
#include "memory/dump.h"
#include "memory/hashmap.h"
#include "arith/integer.h"
#include "ops/intern.h"
#include "io/io.h"
#include "ops/lispops.h"
#include "arith/peano.h"
#include "io/print.h"
#include "arith/ratio.h"
#include "io/read.h"
#include "arith/real.h"
#include "memory/vectorspace.h"

/*
 * for the time being things which may be read are:
 * * strings
 * * numbers - either integer, ratio or real
 * * lists
 * * maps
 * * keywords
 * * atoms
 */

struct cons_pointer read_number( struct stack_frame *frame,
                                 struct cons_pointer frame_pointer,
                                 URL_FILE * input, wint_t initial,
                                 bool seen_period );
struct cons_pointer read_list( struct stack_frame *frame,
                               struct cons_pointer frame_pointer,
                               struct cons_pointer env,
                               URL_FILE * input, wint_t initial );
struct cons_pointer read_map( struct stack_frame *frame,
                              struct cons_pointer frame_pointer,
                              struct cons_pointer env,
                              URL_FILE * input, wint_t initial );
struct cons_pointer read_string( URL_FILE * input, wint_t initial );
struct cons_pointer read_symbol_or_key( URL_FILE * input, uint32_t tag,
                                        wint_t initial );

/**
 * quote reader macro in C (!)
 */
struct cons_pointer c_quote( struct cons_pointer arg ) {
    return make_cons( c_string_to_lisp_symbol( L"quote" ),
                      make_cons( arg, NIL ) );
}

/**
 * Read a path macro from the stream. A path macro is expected to be
 * 1. optionally a leading character such as '/' or '$', followed by
 * 2. one or more keywords with leading colons (':') but no intervening spaces; or
 * 3. one or more symbols separated by slashes; or
 * 4. keywords (with leading colons) interspersed with symbols (prefixed by slashes).
 */
struct cons_pointer read_path( URL_FILE * input, wint_t initial,
                               struct cons_pointer q ) {
    bool done = false;
    struct cons_pointer prefix = NIL;

    switch ( initial ) {
        case '/':
            prefix = c_string_to_lisp_symbol( L"oblist" );
            break;
        case '$':
        case LSESSION:
            prefix = c_string_to_lisp_symbol( L"session" );
            break;
    }

    while ( !done ) {
        wint_t c = url_fgetwc( input );
        if ( iswblank( c ) || iswcntrl( c ) ) {
            done = true;
        } else if ( url_feof( input ) ) {
            done = true;
        } else {
            switch ( c ) {
                case ':':
                    q = make_cons( read_symbol_or_key
                                   ( input, KEYTV, url_fgetwc( input ) ), q );
                    break;
                case '/':
                    q = make_cons( make_cons
                                   ( c_string_to_lisp_symbol( L"quote" ),
                                     make_cons( read_symbol_or_key
                                                ( input, SYMBOLTV,
                                                  url_fgetwc( input ) ),
                                                NIL ) ), q );
                    break;
                default:
                    if ( iswalpha( c ) ) {
                        q = make_cons( read_symbol_or_key
                                       ( input, SYMBOLTV, c ), q );
                    } else {
                        // TODO: it's really an error. Exception?
                        url_ungetwc( c, input );
                        done = true;
                    }
            }
        }
    }

    // right, we now have the path we want (reversed) in q.
    struct cons_pointer r = NIL;

    for ( struct cons_pointer p = q; !nilp( p ); p = c_cdr( p ) ) {
        r = make_cons( c_car( p ), r );
    }

    dec_ref( q );

    if ( !nilp( prefix ) ) {
        r = make_cons( prefix, r );
    }

    return make_cons( c_string_to_lisp_symbol( L"->" ), r );
}

/**
 * Read the next object on this input stream and return a cons_pointer to it,
 * treating this initial character as the first character of the object
 * representation.
 */
struct cons_pointer read_continuation( struct stack_frame *frame,
                                       struct cons_pointer frame_pointer,
                                       struct cons_pointer env,
                                       URL_FILE * input, wint_t initial ) {
    debug_print( L"entering read_continuation\n", DEBUG_IO );
    struct cons_pointer result = NIL;

    wint_t c;

    for ( c = initial;
          c == '\0' || iswblank( c ) || iswcntrl( c );
          c = url_fgetwc( input ) );

    if ( url_feof( input ) ) {
        result =
            throw_exception( c_string_to_lisp_string
                             ( L"End of file while reading" ), frame_pointer );
    } else {
        switch ( c ) {
            case ';':
                for ( c = url_fgetwc( input ); c != '\n';
                      c = url_fgetwc( input ) );
                /* skip all characters from semi-colon to the end of the line */
                break;
            case EOF:
                result = throw_exception( c_string_to_lisp_string
                                          ( L"End of input while reading" ),
                                          frame_pointer );
                break;
            case '\'':
                result =
                    c_quote( read_continuation
                             ( frame, frame_pointer, env, input,
                               url_fgetwc( input ) ) );
                break;
            case '(':
                result =
                    read_list( frame, frame_pointer, env, input,
                               url_fgetwc( input ) );
                break;
            case '{':
                result = read_map( frame, frame_pointer, env, input,
                                   url_fgetwc( input ) );
                break;
            case '"':
                result = read_string( input, url_fgetwc( input ) );
                break;
            case '-':{
                    wint_t next = url_fgetwc( input );
                    url_ungetwc( next, input );
                    if ( iswdigit( next ) ) {
                        result =
                            read_number( frame, frame_pointer, input, c,
                                         false );
                    } else {
                        result = read_symbol_or_key( input, SYMBOLTV, c );
                    }
                }
                break;
            case '.':
                {
                    wint_t next = url_fgetwc( input );
                    if ( iswdigit( next ) ) {
                        url_ungetwc( next, input );
                        result =
                            read_number( frame, frame_pointer, input, c,
                                         true );
                    } else if ( iswblank( next ) ) {
                        /* dotted pair. \todo this isn't right, we
                         * really need to backtrack up a level. */
                        result =
                            read_continuation( frame, frame_pointer, env,
                                               input, url_fgetwc( input ) );
                        debug_print
                            ( L"read_continuation: dotted pair; read cdr ",
                              DEBUG_IO );
                    } else {
                        read_symbol_or_key( input, SYMBOLTV, c );
                    }
                }
                break;
            case ':':
                result =
                    read_symbol_or_key( input, KEYTV, url_fgetwc( input ) );
                break;
            case '/':
                {
                    /* slash followed by whitespace is legit provided it's not
                     * preceded by anything - it's the division operator. Otherwise,
                     * it's terminal, probably part of a path, and needs pushed back.
                     */
                    wint_t cn = url_fgetwc( input );
                    if ( nilp( result )
                         && ( iswblank( cn ) || iswcntrl( cn ) ) ) {
                        url_ungetwc( cn, input );
                        result = make_symbol_or_key( c, NIL, SYMBOLTV );
                    } else {
                        url_ungetwc( cn, input );
                        result = read_path( input, c, NIL );
                    }
                }
                break;
            case '$':
            case LSESSION:
                result = read_path( input, c, NIL );
                break;
            default:
                if ( iswdigit( c ) ) {
                    result =
                        read_number( frame, frame_pointer, input, c, false );
                } else if ( iswprint( c ) ) {
                    result = read_symbol_or_key( input, SYMBOLTV, c );
                } else {
                    result =
                        throw_exception( make_cons( c_string_to_lisp_string
                                                    ( L"Unrecognised start of input character" ),
                                                    make_string( c, NIL ) ),
                                         frame_pointer );
                }
                break;
        }
    }
    debug_print( L"read_continuation returning\n", DEBUG_IO );
    debug_dump_object( result, DEBUG_IO );

    return result;
}

/**
 * read a number from this input stream, given this initial character.
 * \todo Need to do a lot of inc_ref and dec_ref, to make sure the
 * garbage is collected.
 */
struct cons_pointer read_number( struct stack_frame *frame,
                                 struct cons_pointer frame_pointer,
                                 URL_FILE * input,
                                 wint_t initial, bool seen_period ) {
    debug_print( L"entering read_number\n", DEBUG_IO );

    struct cons_pointer result = make_integer( 0, NIL );
    /* \todo we really need to be getting `base` from a privileged Lisp name -
     * and it should be the same privileged name we use when writing numbers */
    struct cons_pointer base = make_integer( 10, NIL );
    struct cons_pointer dividend = NIL;
    int places_of_decimals = 0;
    wint_t c;
    bool neg = initial == btowc( '-' );

    if ( neg ) {
        initial = url_fgetwc( input );
    }

    debug_printf( DEBUG_IO, L"read_number starting '%c' (%d)\n", initial,
                  initial );

    for ( c = initial; iswdigit( c )
          || c == LPERIOD || c == LSLASH || c == LCOMMA; c = url_fgetwc( input ) ) {
        switch ( c ) {
            case LPERIOD:
                if ( seen_period || !nilp( dividend ) ) {
                    return throw_exception( c_string_to_lisp_string
                                            ( L"Malformed number: too many periods" ),
                                            frame_pointer );
                } else {
                    debug_print( L"read_number: decimal point seen\n",
                                 DEBUG_IO );
                    seen_period = true;
                }
                break;
            case LSLASH:
                if ( seen_period || !nilp( dividend ) ) {
                    return throw_exception( c_string_to_lisp_string
                                            ( L"Malformed number: dividend of rational must be integer" ),
                                            frame_pointer );
                } else {
                    debug_print( L"read_number: ratio slash seen\n",
                                 DEBUG_IO );
                    dividend = result;

                    result = make_integer( 0, NIL );
                }
                break;
            case LCOMMA:
                // silently ignore it.
                break;
            default:
                result = add_integers( multiply_integers( result, base ),
                                        /* /todo: this won't work for hex digits */
                                       make_integer( ( int ) c - ( int ) '0',
                                                     NIL ) );

                debug_printf( DEBUG_IO,
                              L"read_number: added character %c, result now ",
                              c );
                debug_print_object( result, DEBUG_IO );
                debug_print( L"\n", DEBUG_IO );

                if ( seen_period ) {
                    places_of_decimals++;
                }
        }
    }

    /*
     * push back the character read which was not a digit
     */
    url_ungetwc( c, input );

    if ( seen_period ) {
        debug_print( L"read_number: converting result to real\n", DEBUG_IO );
        struct cons_pointer div = make_ratio( result,
                                              make_integer( powl
                                                            ( to_long_double
                                                              ( base ),
                                                              places_of_decimals ),
                                                            NIL ) );
        inc_ref( div );

        result = make_real( to_long_double( div ) );

        dec_ref( div );
    } else if ( integerp( dividend ) ) {
        debug_print( L"read_number: converting result to ratio\n", DEBUG_IO );
        result = make_ratio( dividend, result );
    }

    if ( neg ) {
        debug_print( L"read_number: converting result to negative\n",
                     DEBUG_IO );

        result = negative( result );
    }

    debug_print( L"read_number returning\n", DEBUG_IO );
    debug_dump_object( result, DEBUG_IO );

    return result;
}

/**
 * Read a list from this input stream, which no longer contains the opening
 * left parenthesis.
 */
struct cons_pointer read_list( struct stack_frame *frame,
                               struct cons_pointer frame_pointer,
                               struct cons_pointer env,
                               URL_FILE * input, wint_t initial ) {
    struct cons_pointer result = NIL;
    wint_t c;

    if ( initial != ')' ) {
        debug_printf( DEBUG_IO,
                      L"read_list starting '%C' (%d)\n", initial, initial );
        struct cons_pointer car =
            read_continuation( frame, frame_pointer, env, input,
                               initial );

        /* skip whitespace */
        for ( c = url_fgetwc( input );
              iswblank( c ) || iswcntrl( c ); c = url_fgetwc( input ) );

        if ( c == LPERIOD ) {
            /* might be a dotted pair; indeed, if we rule out numbers with
             * initial periods, it must be a dotted pair. \todo Ought to check,
             * howerver, that there's only one form after the period. */
            result =
                make_cons( car,
                           c_car( read_list( frame,
                                             frame_pointer,
                                             env,
                                             input, url_fgetwc( input ) ) ) );
        } else {
            result =
                make_cons( car,
                           read_list( frame, frame_pointer, env, input, c ) );
        }
    } else {
        debug_print( L"End of list detected\n", DEBUG_IO );
    }

    return result;
}

struct cons_pointer read_map( struct stack_frame *frame,
                              struct cons_pointer frame_pointer,
                              struct cons_pointer env,
                              URL_FILE * input, wint_t initial ) {
    // set write ACL to true whilst creating to prevent GC churn
    struct cons_pointer result =
        make_hashmap( DFLT_HASHMAP_BUCKETS, NIL, TRUE );
    wint_t c = initial;

    while ( c != LCBRACE ) {
        struct cons_pointer key =
            read_continuation( frame, frame_pointer, env, input, c );

        /* skip whitespace */
        for ( c = url_fgetwc( input ); iswblank( c ) || iswcntrl( c );
              c = url_fgetwc( input ) );

        struct cons_pointer value =
            read_continuation( frame, frame_pointer, env, input, c );

        /* skip commaa and whitespace at this point. */
        for ( c = url_fgetwc( input );
              c == LCOMMA || iswblank( c ) || iswcntrl( c );
              c = url_fgetwc( input ) );

        result =
            hashmap_put( result, key,
                         eval_form( frame, frame_pointer, value, env ) );
    }

    // default write ACL for maps should be NIL.
    pointer_to_vso( result )->payload.hashmap.write_acl = NIL;

    return result;
}

/**
 * Read a string. This means either a string delimited by double quotes
 * (is_quoted == true), in which case it may contain whitespace but may
 * not contain a double quote character (unless escaped), or one not
 * so delimited in which case it may not contain whitespace (unless escaped)
 * but may contain a double quote character (probably not a good idea!)
 */
struct cons_pointer read_string( URL_FILE * input, wint_t initial ) {
    struct cons_pointer cdr = NIL;
    struct cons_pointer result;
    switch ( initial ) {
        case '\0':
            result = NIL;
            break;
        case '"':
            /* making a string of the null character means we can have an empty
             * string. Just returning NIL here would make an empty string
             * impossible. */
            result = make_string( '\0', NIL );
            break;
        default:
            result =
                make_string( initial,
                             read_string( input, url_fgetwc( input ) ) );
            break;
    }

    return result;
}

struct cons_pointer read_symbol_or_key( URL_FILE * input, uint32_t tag,
                                        wint_t initial ) {
    struct cons_pointer cdr = NIL;
    struct cons_pointer result;
    switch ( initial ) {
        case '\0':
            result = make_symbol_or_key( initial, NIL, tag );
            break;
        case '"':
        case '\'':
            /* unwise to allow embedded quotation marks in symbols */
        case ')':
        case ':':
        case '/':
            /*
             * symbols and keywords may not include right-parenthesis,
             * slashes or colons.
             */
            result = NIL;
            /*
             * push back the character read
             */
            url_ungetwc( initial, input );
            break;
        default:
            if ( iswprint( initial )
                 && !iswblank( initial ) ) {
                result =
                    make_symbol_or_key( initial,
                                        read_symbol_or_key( input,
                                                            tag,
                                                            url_fgetwc
                                                            ( input ) ), tag );
            } else {
                result = NIL;
                /*
                 * push back the character read
                 */
                url_ungetwc( initial, input );
            }
            break;
    }

    debug_print( L"read_symbol_or_key returning\n", DEBUG_IO );
    debug_dump_object( result, DEBUG_IO );

    return result;
}

/**
 * Read the next object on this input stream and return a cons_pointer to it.
 */
struct cons_pointer read( struct
                          stack_frame
                          *frame, struct cons_pointer frame_pointer,
                          struct cons_pointer env, URL_FILE * input ) {
    return read_continuation( frame, frame_pointer, env, input,
                              url_fgetwc( input ) );
}