language

some fool's attempt at an interpreted language
Log | Files | Refs | README

lexer.py (4456B)


      1 import re
      2 
      3 class AtomicSymbol():
      4 	def __init__(self, symbol):
      5 		self.raw = symbol
      6 		self.symbol = re.compile(symbol)
      7 	
      8 	def match(self, tokenstring, index):
      9 		return [index + 1, [tokenstring[index]]]                          \
     10 		       if self.symbol.match(tokenstring[index]) else [False, None]
     11 
     12 class CompoundSymbol():
     13 	def __init__(self, symbols):
     14 		for x, i in enumerate(symbols):
     15 			if type(i) is str:
     16 				symbols[x] = AtomicSymbol(i)
     17 
     18 		self.symbols = symbols
     19 
     20 	def add(self, symbol):
     21 		if type(symbol) is str:
     22 			symbol = AtomicSymbol(symbol)
     23 
     24 		self.symbols.append(symbol)
     25 
     26 	def match(self, tokenstring, index):
     27 		rv = []
     28 		for i in self.symbols:
     29 			r = i.match(tokenstring, index)
     30 			if r[0]:
     31 				rv = r[1]
     32 				index = r[0]
     33 				break
     34 
     35 		return [index, rv] if len(rv) > 0 else [False, None]
     36 
     37 class InclusiveSymbol(CompoundSymbol):
     38 	def match(self, tokenstring, index):
     39 		rv = []
     40 		for i in self.symbols:
     41 			r = i.match(tokenstring, index)
     42 			if r[0]:
     43 				rv = r[1]
     44 				index = r[0]
     45 				break
     46 
     47 		return [index, rv] if len(rv) > 0 else [False, None]
     48 
     49 class ExclusiveSymbol(CompoundSymbol):
     50 	def match(self, tokenstring, index):
     51 		rv = [tokenstring[index]]
     52 		for i in self.symbols:
     53 			r = i.match(tokenstring, index)
     54 			if r[0]:
     55 				rv = []
     56 
     57 		return [index + 1, rv] if len(rv) > 0 else [False, None]
     58 
     59 class PolySymbol():
     60 	def __init__(self, symbols, terminator=[]):
     61 		self.symbols = symbols
     62 		self.terminator = terminator
     63 	
     64 	def match(self, tokenstring, index):
     65 		rv = []
     66 		while index+1 < len(tokenstring):
     67 			for t in self.terminator:
     68 				r = t.match(tokenstring, index)
     69 				if r[0]:
     70 					break
     71 			v = False
     72 			for s in self.symbols:
     73 				r = s.match(tokenstring, index)
     74 				if r[0]:
     75 					rv.extend(r[1])
     76 					index = r[0]
     77 					v = True
     78 					break
     79 			if not v:
     80 				break
     81 			
     82 		return [index, rv] if len(rv) > 0 else [False, None]
     83 
     84 class GroupingSymbol(PolySymbol):
     85 	def match(self, tokenstring, index):
     86 		found = False
     87 		rv = []
     88 		r = self.symbols[0].match(tokenstring, index)
     89 		if r[0]:
     90 			index = r[0]
     91 			ignore = 0
     92 			while index < len(tokenstring):
     93 				r = self.symbols[0].match(tokenstring, index)
     94 				if r[0]:
     95 					ignore += 1
     96 					rv.extend(r[1])
     97 					index = r[0]
     98 				else:
     99 					r = self.symbols[1].match(tokenstring, index)
    100 					if r[0]:
    101 						if ignore > 0:
    102 							ignore -= 1
    103 							rv.extend(r[1])
    104 							index = r[0]
    105 						else:
    106 							index = r[0]
    107 							found = True
    108 							break
    109 					else:
    110 						rv.append(tokenstring[index])
    111 						index += 1
    112 
    113 		return [index, rv] if found else [False, None]
    114 
    115 class Statement():
    116 	def __init__(self, name, expression=[], onMatch=None, init=None):
    117 		self.name   = name
    118 		self.expr   = expression
    119 		self.onMatch= onMatch
    120 		self.action = init
    121 	
    122 	def match(self, tokenstring):
    123 		rv = []
    124 		index = 0
    125 		for e in self.expr:
    126 			if index >= len(tokenstring):
    127 				return False
    128 			r = e.match(tokenstring, index)
    129 
    130 			if r[0]:
    131 				rv.append(r[1])
    132 				index = r[0]
    133 			else:
    134 				break
    135 
    136 		matched = index == len(tokenstring)
    137 
    138 		return rv if matched else False
    139 
    140 
    141 
    142 class Tokenizer():
    143 	def __init__(self, symbol_delim, statement_delim):
    144 		self.symbol_delim    = symbol_delim
    145 		self.statement_delim = statement_delim
    146 
    147 		self.symbols = []
    148 
    149 	# Based off of self.symbol_delim, and string literals, break code into bits
    150 	def generate_symbols(self, raw_string):
    151 		tmp = ""
    152 
    153 		#Thing that keeps string literals in tact.
    154 		in_string = False
    155 		no_escape = True
    156 		for char in raw_string:
    157 			if char == "\\":
    158 				no_escape = False
    159 			if char == "\"" and no_escape:
    160 				if in_string:
    161 					tmp = tmp + char
    162 					self.symbols.append(tmp)
    163 					tmp = ""
    164 					in_string = False
    165 				else:
    166 					self.symbols.append(tmp)
    167 					tmp = ""
    168 					tmp = "\0" + tmp + char
    169 					in_string = True
    170 			else:
    171 				tmp = tmp + char
    172 				if char != "\\" and no_escape == False:
    173 					no_escape = True
    174 
    175 		self.symbols.append(tmp)
    176 
    177 		# Go and split them codes into symbols!
    178 		for i in self.symbol_delim:
    179 			tmp = []
    180 			for x in self.symbols:
    181 				if len(x) > 0:
    182 					# This checks for the work the above code did
    183 					# It prevents string literals from being subdivided
    184 					if x[0] != "\0":
    185 						tmp.extend(re.split("({})".format(i), x))
    186 					else:
    187 						tmp.append(x)
    188 			self.symbols = tmp
    189 
    190 	def generate_statements(self, raw):
    191 		rv = []
    192 		tmp = []
    193 
    194 		self.generate_symbols(raw)
    195 
    196 		for i in self.symbols:
    197 			t = i.strip()
    198 			if len(t) > 0:
    199 				tmp.append(t)
    200 	
    201 			for x in self.statement_delim:
    202 				if x == i:
    203 					rv.append(tmp)
    204 					tmp = []
    205 
    206 		return rv
    207