language

Some fools attempt at an interpreted language
Log | Files | Refs

commit 2ca2d45b1126d5cfc5052e5e186b379b7bce4ef6
parent 74be65f0a0d7b7fd37ee4aae722320df5a2fd624
Author: Paul Longtine <paullongtine@gmail.com>
Date:   Thu Mar 31 14:15:34 2016

Added lexer prototype and commented some

Diffstat:
 src/lexer_prototype/lexer.py              | 328 +++++++++++++++++++++++++++++++-
 src/lexer_prototype/test_files/simple.ti  |   4 +-
 src/lexer_prototype/test_files/testing.ti |  20 ++-
 src/vm/inc/bc.h                           |  18 +-
 src/vm/inc/ins_def.h                      |   4 +-
 src/vm/inc/pc.h                           |  48 +++--
 src/vm/src/fh.c                           |   6 +-
 src/vm/src/ins_def.c                      |  58 ++++-
 src/vm/src/pc.c                           |  48 +++--
 9 files changed, 484 insertions(+), 50 deletions(-)

diff --git a/src/lexer_prototype/lexer.py b/src/lexer_prototype/lexer.py @@ -0,0 +1,328 @@ +import re + +class AtomicSymbol(): + def __init__(self, symbol): + self.symbol = re.compile(symbol) + + def match(self, tokenstring, index): + return [index + 1, [tokenstring[index]]] \ + if self.symbol.match(tokenstring[index]) else [False, None] + +class CompoundSymbol(): + def __init__(self, symbols): + for x, i in enumerate(symbols): + if type(i) is str: + symbols[x] = AtomicSymbol(i) + + self.symbols = symbols + + def match(self, tokenstring, index): + rv = [] + for i in self.symbols: + r = i.match(tokenstring, index) + if r[0]: + rv = r[1] + index = r[0] + break + + return [index, rv] if len(rv) > 0 else [False, None] + +class InclusiveSymbol(CompoundSymbol): + def match(self, tokenstring, index): + rv = [] + for i in self.symbols: + r = i.match(tokenstring, index) + if r[0]: + rv = r[1] + index = r[0] + break + + return [index, rv] if len(rv) > 0 else [False, None] + +class ExclusiveSymbol(CompoundSymbol): + def match(self, tokenstring, index): + rv = [tokenstring[index]] + for i in self.symbols: + r = i.match(tokenstring, index) + if r[0]: + rv = [] + + return [index + 1, rv] if len(rv) > 0 else [False, None] + +class PolySymbol(): + def __init__(self, symbols, terminator=[]): + self.symbols = symbols + self.terminator = terminator + + def match(self, tokenstring, index): + rv = [] + while index+1 < len(tokenstring): + for t in self.terminator: + r = t.match(tokenstring, index) + if r[0]: + break + v = False + for s in self.symbols: + r = s.match(tokenstring, index) + if r[0]: + rv.extend(r[1]) + index = r[0] + v = True + break + if not v: + break + + return [index, rv] if len(rv) > 0 else [False, None] + +class GroupingSymbol(PolySymbol): + def match(self, tokenstring, index): + rv = [] + r = self.symbols[0].match(tokenstring, index) + if r[0]: + rv.extend(r[1]) + index = r[0] + while index < len(tokenstring): + r = self.symbols[1].match(tokenstring, index) + if r[0]: + index = r[0] + rv.extend(r[1]) + break + else: + rv.append(tokenstring[index]) + index += 1 + + return [index, rv] if len(rv) > 0 else [False, None] + +class Statement(): + def __init__(self, name, expression=[]): + self.name = name + self.expr = expression + + def match(self, tokenstring): + rv = [] + index = 0 + for e in self.expr: + r = e.match(tokenstring, index) + if r[0]: + rv.append(r[1]) + index = r[0] + else: + break + + return rv if index == len(tokenstring) else False + +class Tokenizer(): + def __init__(self, symbol_delim, statement_delim): + self.symbol_delim = symbol_delim + self.statement_delim = statement_delim + self.symbols = [] + + # Based off of self.symbol_delim, and string literals, break code into bits + def generate_symbols(self, raw_string): + tmp = "" + + #Thing that keeps string literals in tact. + in_string = False + no_escape = True + for char in raw_string: + if char == "\\": + no_escape = False + if char == "\"" and no_escape: + if in_string: + tmp = tmp + char + self.symbols.append(tmp) + tmp = "" + in_string = False + else: + self.symbols.append(tmp) + tmp = "" + tmp = "\0" + tmp + char + in_string = True + else: + tmp = tmp + char + if char != "\\" and no_escape == False: + no_escape = True + + self.symbols.append(tmp) + + # Go and split them codes into symbols! + for i in self.symbol_delim: + tmp = [] + for x in self.symbols: + if len(x) > 0: + # This checks for the work the above code did + # It prevents string literals from being subdivided + if x[0] != "\0": + tmp.extend(re.split("({})".format(i), x)) + else: + tmp.append(x) + self.symbols = tmp + + def generate_statements(self): + rv = [] + tmp = [] + for i in self.symbols: + t = i.strip() + if len(t) > 0: + tmp.append(t) + + for x in self.statement_delim: + if x == i: + rv.append(tmp) + tmp = [] + + return rv + +def lex(file_name): + splitters = [ + ":", + ";", + "\(", + "\)", + "\[", + "\]", + "{", + "}", + ",", + " " + ] + end_statements = [ + ":", + ";", + "{", + "}" + ] + + known_tokens = [ + "if", + "for", + "func" + ] + + defined_types = [ + "int", + "float", + "array" + ] + + number_def = AtomicSymbol("[0-9]+") + + type_def = InclusiveSymbol(defined_types) + label_def = ExclusiveSymbol(defined_types + known_tokens) + + paramlist_def = GroupingSymbol( [ + AtomicSymbol("\("), + AtomicSymbol("\)") + ] ) + + expr_def = PolySymbol( [ + label_def, + number_def, + AtomicSymbol("\("), + AtomicSymbol("\)"), + AtomicSymbol("\+"), + AtomicSymbol("\-"), + AtomicSymbol("\*"), + AtomicSymbol("\/"), + AtomicSymbol("\>"), + AtomicSymbol("\<"), + AtomicSymbol("=\<"), + AtomicSymbol("\>="), + AtomicSymbol("=="), + AtomicSymbol("\""), + AtomicSymbol("'") + ], terminator=[ + AtomicSymbol(";"), + AtomicSymbol(":") + ]) + + active_tokens = [ + Statement( + "codeblock_begin", + expression=[ + AtomicSymbol("{") + ] + ), + Statement( + "codeblock_end", + expression=[ + AtomicSymbol("}") + ] + ), + Statement( + "if", + expression=[ + AtomicSymbol("if"), + expr_def, + AtomicSymbol(":") + ] + ), + Statement( + "for", + expression=[ + AtomicSymbol("for"), + expr_def, + AtomicSymbol(":") + ] + ), + Statement( + "function", + expression=[ + AtomicSymbol("func"), + label_def, + paramlist_def, + AtomicSymbol("->"), + type_def, + AtomicSymbol(":") + ] + ), + Statement( + "instantiation", + expression=[ + type_def, + label_def, + AtomicSymbol("="), + expr_def, + AtomicSymbol(";") + ] + ), + Statement( + "assignment", + expression=[ + label_def, + AtomicSymbol("="), + expr_def, + AtomicSymbol(";") + ] + ), + Statement( + "expression", + expression=[ + expr_def, + AtomicSymbol(";") + ] + ) + ] + data="" + with open(file_name, 'r') as program: + data=program.read().replace('\n', '') + + symbols = Tokenizer(splitters, end_statements) + + symbols.generate_symbols(data) + + lines = symbols.generate_statements() + rv = [] + for l in lines: + for a in active_tokens: + r = a.match(l) + if r: + rv.append((a.name,r)) + break + + return rv + +if __name__ == "__main__": + import sys + for i in lex(sys.argv[1]): + print(i) diff --git a/src/lexer_prototype/test_files/simple.ti b/src/lexer_prototype/test_files/simple.ti @@ -0,0 +1,4 @@ +func testing (int i, int x) -> int: +{ + return(0); +} diff --git a/src/lexer_prototype/test_files/testing.ti b/src/lexer_prototype/test_files/testing.ti @@ -0,0 +1,20 @@ +func some_important_function (int x, int y, int z) -> int: +{ + int c = (x * y) + z; + if c > 3: + { + c = c - 3; + } + return(c); +} + +int x = 3; + +array y = [ "Symbols suck", 32, "I know, right?" ]; + +if x == 3: +{ + print("Potatoes are good for \"the\" soul."); +} + +some_important_function(3, 4, 2); diff --git a/src/vm/inc/bc.h b/src/vm/inc/bc.h @@ -16,18 +16,18 @@ typedef unsigned int bc_addr; /* 'Bytecode Container' */ typedef struct bc_cont { - bc_addr real_addr; - byte_t op; - byte_t mdata; - byte_t adata; - byte_t* args[3]; - var_cont* varg[3]; - int sarg[3]; + bc_addr real_addr; // Real address of instruction + byte_t op; // Opcode of instruction + byte_t mdata; // Opcode metadata + byte_t adata; // Opcode arguement metadata + byte_t* args[3]; // Raw arguements + int sarg[3]; // Size of arguements + var_cont* varg[3]; // Typed arguements } bc_cont; typedef struct bc_t { - bc_addr size; - bc_cont** heap; + bc_addr size; // Size of program + bc_cont** heap; // Heap of instructions } bc_t; #include "is.h" diff --git a/src/vm/inc/ins_def.h b/src/vm/inc/ins_def.h @@ -29,6 +29,10 @@ void init_ins_def( void ); */ int ins_def_is_valid(bc_cont*); +/* Instruction subroutines. Each subroutine takes the following arguements: + * rt_t* - Runtime context + * bc_cont* - Instruction data + */ void _ins_def_NULL (rt_t*, bc_cont*); void _ins_def_SYNC (rt_t*, bc_cont*); void _ins_def_PRINT (rt_t*, bc_cont*); diff --git a/src/vm/inc/pc.h b/src/vm/inc/pc.h @@ -16,20 +16,22 @@ typedef unsigned short int pc_addr; +/* Address stack structure + */ typedef struct pc_addr_stk { - size_t size; - int ptr; - pc_addr* addresses; + int size; // Capacity of stack + int ptr; // Stack pointer + pc_addr* addresses; // Stack data } pc_addr_stk; -/* pc_t structure +/* Program counter structure */ typedef struct pc_t { - pc_addr limit; - pc_addr address; - pc_addr_stk* stk; - bc_cont* line; - bc_t* bc; + pc_addr limit; // End of program + pc_addr address; // Current address + pc_addr_stk* stk; // Address stack + bc_cont* line; // Current instruction + bc_t* bc; // Bytecode container instance } pc_t; /* Initalizes program counter, returns pc_t* instance @@ -37,39 +39,49 @@ typedef struct pc_t { */ pc_t* pc_new(char*); +/* Creates a new address stack. + * ns_addr - inital address + */ pc_addr_stk* pc_addr_stk_new(ns_addr); /* Frees memory assosiated with pc_t* instance */ void pc_del(pc_t*); -void pc_addr_stk_del(pc_addr_stk*); -/* Pushes instruction to heap +/* Frees memory assosiated with pc_addr_stk* instance */ -void pc_push(pc_t*, bc_cont*); +void pc_addr_stk_del(pc_addr_stk*); -/* Updates program counter on changes in address +/* Updates current insturction + * + * When called, pc_t->line will reflect pc_t->bc[pc_t->address] */ void pc_update(pc_t*); -/* Increment program counter by amount in pc_addr +/* Increment program counter by +-addr + * pc_t* - Program counter instance + * pc_addr - Address to increment by */ void pc_inc(pc_t*, pc_addr); -/* Branch to addr +/* Branch to specified address. + * pc_t* - Program counter instance + * pc_addr - Address to branch to */ void pc_branch(pc_t*, pc_addr); -/* Return from branch +/* Return from previous branch. + * pc_t* - Program counter instance */ void pc_return(pc_t*); /* Simply goto that address + * pc_t* - Program counter instance + * pc_addr - Address to go to */ void pc_goto(pc_t*, pc_addr); -/* For now, a simple function that returns true if the next instruction is not - * NULL. +/* For now, a simple function that tests if the address is in range */ int pc_safe(pc_t* pc); diff --git a/src/vm/src/fh.c b/src/vm/src/fh.c @@ -4,6 +4,8 @@ #include "fh.h" #include "helper.h" +/* Reads n bytes, n passed as @param long, returns array of byte_t + */ byte_t* read_bytes(FILE* f, long bytes) { byte_t* buffer = (byte_t*)malloc(bytes*sizeof(byte_t)); @@ -15,11 +17,15 @@ byte_t* read_bytes(FILE* f, long bytes) return buffer; } +/* Reads a byte, returns byte_t + */ byte_t read_byte(FILE* f) { return fgetc(f); } +/* Determines filesize, returns size as long + */ long read_size(FILE* f) { fseek(f, 0, SEEK_END); diff --git a/src/vm/src/ins_def.c b/src/vm/src/ins_def.c @@ -68,6 +68,7 @@ void init_ins_def( void ) INS_DEF[0x70] = _ins_def_GOTO; INS_DEF[0x71] = _ins_def_JUMPF; + INS_DEF[0x73] = _ins_def_ELSE; INS_DEF[0x7E] = _ins_def_DONE; INS_DEF[0x7F] = _ins_def_CALL; @@ -426,28 +427,36 @@ void _ins_def_JUMPF (rt_t* ctx, bc_cont* line) } void _ins_def_IFDO (rt_t* ctx, bc_cont* line) { + // Get the value off the stack var_cont* var = stk_pop(ctx->stack); - int value = var_data_get_G_INT(var); + // If the value is false, find an ELSE statement or DONE statement. if (value < 1) { while (pc_safe(ctx->pc)) { pc_update(ctx->pc); pc_inc(ctx->pc, 1); - + // Is the instruction an ELSE statement? if (ctx->pc->line->op == 0x73) { + // We're done here. break; } else + // Is the instruction a DONE statement? if (ctx->pc->line->op == 0x7E) { + // We're done here. break; } } } } +void _ins_def_ELSE (rt_t* ctx, bc_cont* line) +{ + pc_inc(ctx->pc, 1); +} void _ins_def_DONE (rt_t* ctx, bc_cont* line) { pc_inc(ctx->pc, 1); @@ -456,26 +465,36 @@ void _ins_def_CALL (rt_t* ctx, bc_cont* line) { int name = var_data_get_G_INT(line->varg[0]); + // Get the function's variable container var_cont* var = proc_getvar(ctx, 1, name); - var_data_func* func = var_data_get_FUNC(var); + // Push a new namespace of specified size ns_push(ctx->vars, func->size); - + // Declare the return value ns_dec(ctx->vars, func->type, 0, 0); + // Throw in the arguements on the arguement stack into the new namespace int offset = 1; int i; for (i = 0; i < func->paramlen; i++) { + // Pop the arguement stack var_cont* arg = stk_pop(ctx->argstk); + + // Is the arguement of the right type? ASSERT(arg->type == func->param[i], "Invalid function call\n"); + + // Declare the name in the new namespace and pass the arguements ns_dec(ctx->vars, arg->type, 0, i+offset); ns_set(ctx->vars, 0, i+offset, arg); } + // Push new stack levels for the stack and the arguement stack stk_newlevel(&ctx->stack); stk_newlevel(&ctx->argstk); + + // Branch to functions body pc_branch(ctx->pc, func->loc); } @@ -502,15 +521,20 @@ void _ins_def_CALLM (rt_t* ctx, bc_cont* line) void _ins_def_RETURN (rt_t* ctx, bc_cont* line) { + // Pop the namespace and get the return value var_cont* return_value = ns_pop(ctx->vars); + // Pop a level in the stack and arguement stack stk_poplevel(&ctx->stack); stk_poplevel(&ctx->argstk); + // Push the return value to the stack stk_push(ctx->stack, return_value); + // Return to the callee pc_return(ctx->pc); + // And increment the program counter pc_inc(ctx->pc, 1); } void _ins_def_NEW (rt_t* ctx, bc_cont* line) @@ -528,35 +552,49 @@ void _ins_def_DEFUN (rt_t* ctx, bc_cont* line) b_type* args = var_data_get_PLIST(line->varg[2]); size_t alen = line->sarg[2]; + // Create a new variable for the new function var_cont* func = var_new(FUNC); + // Allocate the data. var_data_func* data = var_data_alloc_FUNC(type); + // Set the location of the functions body data->loc = line->real_addr + 1; int nsize; - + /* Determine the namespace size by finding variable declarations in the functions body, + Along with determining the end of the function. + */ for (nsize = 0; pc_safe(ctx->pc); pc_update(ctx->pc)) { + // Increment the program counter pc_inc(ctx->pc, 1); + + // Is this the end? if (ctx->pc->line->op == 0xF0) { break; } else + // Are we declaring a variable? if (ctx->pc->line->op == 0x20) { + // If so, increment the namespace size. nsize++; } } - data->end = ctx->pc->line->real_addr; - data->size = nsize + alen + 1;// How many names will this function have? - data->type = type; // Return type - data->paramlen = alen; - data->param = args; + // Set all the values. + data->end = ctx->pc->line->real_addr; // This is the end! + data->size = nsize + alen + 1; // How many names will this function have? + data->type = type; // Return type + data->paramlen = alen; // Set the arguement length + data->param = args; // Set the parameter list + // Throw the data in a variable container var_set(func, data, FUNC); + // Declare a name proc_decvar(ctx, FUNC, 1, name); + // Set the name's value to the function we just defined proc_setvar(ctx, 1, name, func); } diff --git a/src/vm/src/pc.c b/src/vm/src/pc.c @@ -17,13 +17,13 @@ pc_t* pc_new(char* fname) pc_t* pc = (pc_t*)malloc(sizeof(pc_t)); M_ASSERT(pc); - + // Make a new address stack and set its inital address to 0 pc->stk = pc_addr_stk_new(0); - + // Make sure the inital address is 0 pc_branch(pc, 0); - + // Read the program specified pc->bc = bc_init(fname); - + // Update the program counter. pc_update(pc); N_ASSERT(pc->line, "Error in creating new program counter\n"); @@ -31,6 +31,9 @@ pc_t* pc_new(char* fname) return pc; } +/* Creates a new address stack. + * ns_addr - inital address + */ pc_addr_stk* pc_addr_stk_new(ns_addr address) { pc_addr_stk* new = (pc_addr_stk*)malloc(sizeof(pc_addr_stk)); @@ -41,6 +44,7 @@ pc_addr_stk* pc_addr_stk_new(ns_addr address) new->ptr = 0; new->size = PC_RETURN_DEPTH; + new->addresses[new->ptr] = address; return new; } @@ -53,12 +57,16 @@ void pc_del(pc_t* pc) N_ASSERT(pc->stk, "pc_del\n"); N_ASSERT(pc->bc, "pc_del\n"); + // Free the address stack pc_addr_stk_del(pc->stk); + // Free the program bc_del(pc->bc); free(pc); } +/* Frees memory assosiated with pc_addr_stk* instance + */ void pc_addr_stk_del(pc_addr_stk* stk) { N_ASSERT(stk, "pc_addr_stk_del\n"); @@ -68,7 +76,7 @@ void pc_addr_stk_del(pc_addr_stk* stk) free(stk); } -/* Updates currently reading line +/* Updates current insturction * * When called, pc_t->line will reflect pc_t->bc[pc_t->address] */ @@ -76,59 +84,73 @@ void pc_update(pc_t* pc) { N_ASSERT(pc, "pc_update\n"); ASSERT((pc->address <= pc->bc->size), "Uhoh\n"); + // Update the pointer pc->line = pc->bc->heap[pc->address]; } /* Increment program counter by +-addr + * pc_t* - Program counter instance + * pc_addr - Address to increment by */ void pc_inc(pc_t* pc, pc_addr addr) { N_ASSERT(pc, "pc_inc\n"); - + // Increment pc->address by addr pc->address = pc->address + addr; } -/* Branch to addr +/* Branch to specified address. + * pc_t* - Program counter instance + * pc_addr - Address to branch to */ void pc_branch(pc_t* pc, pc_addr address) { N_ASSERT(pc, "pc_branch\n"); - + // Is there space in the address stack? ASSERT(((pc->stk->ptr + 1) < pc->stk->size), "Address Stack Overflow\n"); + // Push the current address on to the stack pc->stk->addresses[pc->stk->ptr] = pc->address; + + // Increment the stack pointer pc->stk->ptr = pc->stk->ptr + 1; + // Set the current address to the branching address pc->address = address; } -/* Return from branch +/* Return from previous branch. + * pc_t* - Program counter instance */ void pc_return(pc_t* pc) { N_ASSERT(pc, "pc_return\n"); - + // Is the stack pointer in range? ASSERT(((pc->stk->ptr - 1) > 0), "Address Stack Underflow\n"); + // Decrement the stack pointer pc->stk->ptr = pc->stk->ptr - 1; + // Pop the stack to pc->address pc->address = pc->stk->addresses[pc->stk->ptr]; } /* Simply goto that address + * pc_t - Program counter instance + * pc_addr - Address to branch to */ void pc_goto(pc_t* pc, pc_addr addr) { N_ASSERT(pc, "pc_goto\n"); - + // Do I really need to comment? pc->address = addr; } -/* For now, a simple function that returns true if the next instruction is not - * NULL. +/* For now, a simple function that tests if the address is in range */ int pc_safe(pc_t* pc) { + // If the address is in range return true, if out of range return false int rv = (pc->address >= pc->bc->size) ? 0 : 1; return rv; }