lexer.py (4456B)
import re

# Every match() below returns [next_index, matched_tokens] on success
# and [False, None] on failure.

class AtomicSymbol():
    # Matches a single token against a regular expression.
    def __init__(self, symbol):
        self.raw = symbol
        self.symbol = re.compile(symbol)

    def match(self, tokenstring, index):
        return [index + 1, [tokenstring[index]]] \
            if self.symbol.match(tokenstring[index]) else [False, None]

class CompoundSymbol():
    # An ordered list of alternatives; the first symbol that matches wins.
    def __init__(self, symbols):
        for x, i in enumerate(symbols):
            if type(i) is str:
                symbols[x] = AtomicSymbol(i)

        self.symbols = symbols

    def add(self, symbol):
        if type(symbol) is str:
            symbol = AtomicSymbol(symbol)

        self.symbols.append(symbol)

    def match(self, tokenstring, index):
        rv = []
        for i in self.symbols:
            r = i.match(tokenstring, index)
            if r[0]:
                rv = r[1]
                index = r[0]
                break

        return [index, rv] if len(rv) > 0 else [False, None]

class InclusiveSymbol(CompoundSymbol):
    # Matches any one of its symbols; the behaviour is inherited from
    # CompoundSymbol unchanged.
    pass

class ExclusiveSymbol(CompoundSymbol):
    # Matches the current token only if *none* of its symbols match it.
    def match(self, tokenstring, index):
        rv = [tokenstring[index]]
        for i in self.symbols:
            r = i.match(tokenstring, index)
            if r[0]:
                rv = []

        return [index + 1, rv] if len(rv) > 0 else [False, None]

class PolySymbol():
    # Greedily matches a run of symbols until a terminator symbol is
    # reached or nothing matches.
    def __init__(self, symbols, terminator=None):
        self.symbols = symbols
        self.terminator = terminator if terminator is not None else []

    def match(self, tokenstring, index):
        rv = []
        while index + 1 < len(tokenstring):
            # Stop before consuming a terminator symbol.
            if any(t.match(tokenstring, index)[0] for t in self.terminator):
                break
            v = False
            for s in self.symbols:
                r = s.match(tokenstring, index)
                if r[0]:
                    rv.extend(r[1])
                    index = r[0]
                    v = True
                    break
            if not v:
                break

        return [index, rv] if len(rv) > 0 else [False, None]

class GroupingSymbol(PolySymbol):
    # Collects everything between a balanced pair of delimiters:
    # symbols[0] opens the group, symbols[1] closes it.
    def match(self, tokenstring, index):
        found = False
        rv = []
        r = self.symbols[0].match(tokenstring, index)
        if r[0]:
            index = r[0]
            ignore = 0  # nesting depth of inner groups
            while index < len(tokenstring):
                r = self.symbols[0].match(tokenstring, index)
                if r[0]:
                    ignore += 1
                    rv.extend(r[1])
                    index = r[0]
                else:
                    r = self.symbols[1].match(tokenstring, index)
                    if r[0]:
                        if ignore > 0:
                            ignore -= 1
                            rv.extend(r[1])
                            index = r[0]
                        else:
                            index = r[0]
                            found = True
                            break
                    else:
                        rv.append(tokenstring[index])
                        index += 1

        return [index, rv] if found else [False, None]

class Statement():
    # A named sequence of symbols that must consume the whole token string.
    def __init__(self, name, expression=None, onMatch=None, init=None):
        self.name = name
        self.expr = expression if expression is not None else []
        self.onMatch = onMatch
        self.action = init

    def match(self, tokenstring):
        rv = []
        index = 0
        for e in self.expr:
            if index >= len(tokenstring):
                return False
            r = e.match(tokenstring, index)

            if r[0]:
                rv.append(r[1])
                index = r[0]
            else:
                break

        matched = index == len(tokenstring)

        return rv if matched else False


class Tokenizer():
    def __init__(self, symbol_delim, statement_delim):
        self.symbol_delim = symbol_delim
        self.statement_delim = statement_delim

        self.symbols = []

    # Break raw_string into symbols based on self.symbol_delim, while
    # keeping string literals intact.
    def generate_symbols(self, raw_string):
        self.symbols = []  # reset so repeated calls don't accumulate
        tmp = ""

        # First pass: pull out string literals whole, marking each with
        # a leading "\0" so the split pass below leaves it alone.
        in_string = False
        no_escape = True
        for char in raw_string:
            if char == "\\":
                no_escape = False
            if char == "\"" and no_escape:
                if in_string:
                    tmp = tmp + char
                    self.symbols.append(tmp)
                    tmp = ""
                    in_string = False
                else:
                    self.symbols.append(tmp)
                    tmp = "\0" + char
                    in_string = True
            else:
                tmp = tmp + char
            if char != "\\" and no_escape == False:
                no_escape = True

        self.symbols.append(tmp)

        # Second pass: split everything else on each delimiter.  The
        # delimiters are used as regex patterns; the capturing group
        # keeps them in the output.
        for i in self.symbol_delim:
            tmp = []
            for x in self.symbols:
                if len(x) > 0:
                    # The "\0" prefix marks a string literal from the
                    # first pass; leave it whole.
                    if x[0] != "\0":
                        tmp.extend(re.split("({})".format(i), x))
                    else:
                        tmp.append(x)
            self.symbols = tmp

    def generate_statements(self, raw):
        rv = []
        tmp = []

        self.generate_symbols(raw)

        for i in self.symbols:
            t = i.strip()
            if len(t) > 0:
                tmp.append(t)

            for x in self.statement_delim:
                if x == i:
                    rv.append(tmp)
                    tmp = []

        return rv
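
# ----------------------------------------------------------------------
# Usage sketch: the delimiters and the "assign" grammar below are
# illustrative assumptions, chosen only to show how Tokenizer,
# AtomicSymbol, and Statement fit together.
if __name__ == "__main__":
    # Split on "=" and ";"; the delimiters are fed to re.split as
    # patterns, so regex metacharacters would need escaping.  ";" also
    # marks the end of a statement.
    tok = Tokenizer(symbol_delim=["=", ";"], statement_delim=[";"])

    # An assignment statement of the form: <name> = <number> ;
    assign = Statement("assign", expression=[
        AtomicSymbol("[A-Za-z_]+"),  # hypothetical name pattern
        AtomicSymbol("="),
        AtomicSymbol("[0-9]+"),      # hypothetical number pattern
        AtomicSymbol(";"),
    ])

    for stmt in tok.generate_statements("answer = 42;"):
        # Statement.match returns the grouped tokens on success,
        # False otherwise.
        print(stmt, "->", assign.match(stmt))
        # ['answer', '=', '42', ';'] -> [['answer'], ['='], ['42'], [';']]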