/*
Author: Nathaniel Huesler
Year: 2025
Copyright Notice: LICENSE.txt
*/


/*

A simple MIPS assembly parser. Given a list of tokens generated by get_tokens,
parse_MIPS generates an ordered list of instructions, which require further processing
later in compilation.

*/

/*
Token type codes stored in Token::type.

The first five entries take the implicit values 0..4. Every punctuation token uses
its own character code as its type, which is why single_char_tok can store the
matched character directly as the token type.
*/
enum
{
	// text that matched none of the other categories
	TOK_UNKNOWN , 
	// letters, digits and '_' , starting with a letter or '_'
	TOK_IDENTIFIER , 
	// unsigned or '+'-prefixed whole number, value in Token::u32_value
	TOK_POS_INTEGER , 
	// '-'-prefixed whole number, value in Token::i32_value
	TOK_NEG_INTEGER , 
	// number containing a '.' , value in Token::f32_value
	TOK_FLOAT , 
	
	TOK_OPEN_PAREN = '(' , 
	TOK_CLOSE_PAREN = ')' , 
	
	TOK_OPEN_BRACKET = '[' , 
	TOK_CLOSE_BRACKET = ']' , 
	
	TOK_OPEN_BRACES = '{' , 
	TOK_CLOSE_BRACES = '}' , 
	
	TOK_COLON = ':' , 
	TOK_SEMICOLON = ';' , 
	TOK_DOT = '.' , 
	TOK_COMMA = ',' , 
	
	TOK_QUESTION_MARK = '?' , 
	TOK_EXCLAMATION_MARK = '!' , 
	TOK_QUOTATION_MARK = '"' , 
	TOK_APOSTROPHE = '\'' , 
	TOK_PERCENT = '%' , 
	TOK_HASHTAG = '#' , 
	
	TOK_DOLLAR = '$' , 
	TOK_TILDE = '~' , 
	TOK_CARET = '^' , 
	TOK_AMPERSAND = '&' , 
	TOK_PIPE = '|' , 
	TOK_EQUAL = '=' , 
	TOK_SMALLER_THAN = '<' , 
	TOK_GREATER_THAN = '>' , 
	TOK_PLUS = '+' , 
	TOK_MINUS = '-' , 
	TOK_ASTERISK = '*' , 
	TOK_FORWARD_SLASH = '/' , 
	TOK_BACK_SLASH = '\\' , 
};

/*
One lexical token produced by get_tokens.

The token does not own its text: chars points into the buffer that was tokenized,
and the text is not null terminated, so it must always be read together with size.
*/
struct Token
{
	// first character of the token inside the tokenized buffer
	char * chars;
	// number of characters that belong to the token
	u32 size;
	// one of the TOK_* codes
	u32 type;
	// line the token starts on; get_tokens starts counting at 0
	u32 line;
	
	// numeric value, only written by get_tokens for number tokens:
	// u32_value for TOK_POS_INTEGER, i32_value for TOK_NEG_INTEGER, f32_value for TOK_FLOAT
	union
	{
		u32 u32_value;
		i32 i32_value;
		f32 f32_value;
	};
};



/*
Cursor over a token array, used by next, peek and match.
*/
struct TokenCounter
{
	// token most recently consumed by next/match; null until something has been consumed
	Token * prev_token;
	// token that will be examined next
	Token * curr_token;
	// one past the last token (parse_MIPS sets it to tokens + token_count)
	Token * end_token;
	// when non-zero, curr_token is treated as unavailable if it is on a different
	// line than prev_token (see blocked_by_line)
	u32 lock_line;
};

/*
Reports an error prefixed with the line of the token the counter currently points at.

message must be a string literal: it is concatenated onto the "line %:" prefix. mode and
any extra arguments are forwarded to report_error unchanged.

- The space between the prefix and message keeps C++11 compilers from lexing
  "..."message as a user-defined-literal suffix.
- ##__VA_ARGS__ removes the preceding comma when no extra arguments are passed, so
  report_compiler_error("text" , 1 , counter); expands to a valid argument list.
- The trailing semicolon is kept so existing call sites expand as before.
*/
#define report_compiler_error(message , mode , counter , ...) report_error("line %:" message , mode , (counter) -> curr_token -> line , ##__VA_ARGS__);

/*
Description of one instruction.
NOTE(review): not referenced in this part of the file; the field notes below are
inferred from the names only and should be confirmed against the code that uses them.
*/
struct InstrInfo
{
	// presumably the mnemonic text - confirm
	char * name;
	// presumably the numeric opcode - confirm
	u32 op_code;
	// presumably the instruction format/category - confirm
	u32 type;
};

/*
A named tag with a fixed 64 byte name buffer.
NOTE(review): not referenced in this part of the file; presumably a label and the
index of the instruction it refers to - confirm against the code that fills it.
*/
struct Tag
{
	char name[64];
	u32 instr_index;
};

/*
One parsed line element as produced by parse_MIPS: token pointers only, nothing is
interpreted yet. Pointers that were not found on the line stay null.
*/
struct UninterpretedInstr
{
	
	// identifier that starts the instruction; null when the entry is a "name:" tag
	Token * mnemonic;
	// either the identifier in front of ':' , or a bare identifier operand
	Token * tag;
	// integer operand (TOK_POS_INTEGER or TOK_NEG_INTEGER)
	Token * imm;
	// identifiers that followed a '$' , in source order
	Token * regs[3];
	
	// number of used entries in regs
	u32 reg_count;
};


// parser

// Returns 1 for space, tab, line feed and carriage return, 0 for anything else.
inline
u32 is_blank_space(char c)
{
	switch(c)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		{
			return(1);
		}
		
		default:
		{
			return(0);
		}
	}
}


// Returns 1 when c may start an identifier: an ASCII letter or '_' . Returns 0 otherwise.
inline 
u32 is_code_character(char c)
{
	if(c == '_')
	{
		return(1);
	}
	
	if(c >= 'a' && c <= 'z')
	{
		return(1);
	}
	
	if(c >= 'A' && c <= 'Z')
	{
		return(1);
	}
	
	return(0);
}

// Returns 1 when c is one of '0'..'9' , 0 otherwise.
inline 
u32 is_digit(char c)
{
	return((c >= '0' && c <= '9') ? 1 : 0);
}

/*
Tries to read a one character token.

If *curr_char equals char_and_type, next_token gets size 1 and char_and_type as its
type and 1 is returned. Otherwise next_token is left untouched and 0 is returned.
The caller is responsible for advancing past the character.
*/
inline
u32 single_char_tok(Token * next_token , char * curr_char , char char_and_type)
{
	if(*curr_char != char_and_type)
	{
		return(0);
	}
	
	next_token -> type = char_and_type;
	next_token -> size = 1;
	
	return(1);
}


/*
Counts the blank characters at the start of [buffer , end).

*line is incremented once for every '\n' that is passed. The return value is the
number of characters the caller should skip.
*/
inline 
u32 skip_empty_space(char * buffer , char * end , u32 * line)
{
	char * cursor;
	
	for(cursor = buffer; cursor < end && is_blank_space(*cursor); cursor++)
	{
		if(*cursor == '\n')
		{
			*line += 1;
		}
	}
	
	return(cursor-buffer);
}


/*
Splits the text in [start , end) into tokens.

The tokens are written to the free space at the end of arena and then pushed onto it.
Token text is not copied: every Token::chars points into [start , end).

tokens_ptr  receives the address of the first token
count_ptr   receives the number of tokens
arena       memory the tokens are placed in
start , end the text to tokenize; end is one past the last character and is never read

Recognised input:
- blank space and // comments are skipped (any number of them in a row)
- identifiers: a letter or '_' followed by letters, digits or '_'
- numbers: optional '+' or '-' , digits, optional '.' and more digits.
  Integer values that do not fit in 32 bits wrap around.
- single character punctuation tokens, typed with their own character code
- anything else becomes one TOK_UNKNOWN token that runs up to the next blank character

Lines are counted from 0.
*/
void get_tokens(Token ** tokens_ptr , u32 * count_ptr , Mem * arena , char * start , char * end)
{
	Token * tokens = mem_end(*arena , Token);
	u32 capacity = mem_rem(*arena , Token);
	u32 count = 0;
	char * curr_char = start;
	u32 line = 0;
	
	// every character that is a token on its own
	const char single_chars[] = "()[]{}:;.,?!\"'%#$~^&|=<>+-*/\\";
	
	while(curr_char < end)
	{
		
		// skip blank space and comments until real content (or the end) is reached
		for(;;)
		{
			curr_char += skip_empty_space(curr_char , end , &line);
			
			if(end - curr_char >= 2 && curr_char[0] == '/' && curr_char[1] == '/')
			{
				// stop on the newline; skip_empty_space consumes and counts it
				curr_char += 2;
				while(curr_char < end && *curr_char != '\n')
				{
					curr_char++;
				}
			}
			
			else
			{
				break;
			}
		}
		
		// trailing blank space or comments must not produce a token
		if(curr_char >= end)
		{
			break;
		}
		
		// checked before the slot is written; one slot past the last token always stays unused
		assert(count + 1 < capacity);
		
		Token * next_token = tokens + count;
		next_token -> chars = curr_char;
		next_token -> size = 0;
		next_token -> type = TOK_UNKNOWN;
		next_token -> line = line;
		next_token -> u32_value = 0;
		
		if(is_code_character(*curr_char))
		{
			while(curr_char < end && (is_code_character(*curr_char) || is_digit(*curr_char)))
			{
				curr_char++;
			}
			
			next_token -> type = TOK_IDENTIFIER;
			next_token -> size = (u32)(curr_char-next_token -> chars);
			
			assert(next_token -> size > 0);
		}
		
		else if(
				is_digit(curr_char[0]) || 
				((curr_char[0] == '-' || curr_char[0] == '+') && end - curr_char >= 2 && is_digit(curr_char[1])))
		{
			
			f32 sign = +1.0f;
			u32 int_part = 0;
			f32 fract_part = 0;
			f32 fract_exp = 0.1f;
			u32 has_fract = 0;
			
			if(*curr_char == '-')
			{
				sign = -1.0f;
				curr_char++;
			}
			
			else if(*curr_char == '+')
			{
				curr_char++;
			}
			
			// the branch condition guarantees at least one digit here
			while(curr_char < end && is_digit(*curr_char))
			{
				u32 digit = (u32)(*curr_char - '0');
				int_part = int_part*10 + digit;
				curr_char++;
			}
			
			if(curr_char < end && *curr_char == '.')
			{
				has_fract = 1;
				curr_char++;
				while(curr_char < end && is_digit(*curr_char))
				{
					u32 digit = (u32)(*curr_char - '0');
					fract_part += (f32)digit*fract_exp;
					fract_exp /= 10.0f;
					curr_char++;
				}
			}
			
			if(has_fract)
			{
				next_token -> type = TOK_FLOAT;
				next_token -> f32_value = sign*((f32)int_part + fract_part);
			}
			
			else if(sign < 0.0f)
			{
				next_token -> type = TOK_NEG_INTEGER;
				// negate as unsigned so a large magnitude cannot overflow a signed negation
				next_token -> i32_value = (i32)(0 - int_part);
			}
			
			else
			{
				next_token -> type = TOK_POS_INTEGER;
				next_token -> u32_value = int_part;
			}
			
			next_token -> size = (u32)(curr_char-next_token -> chars);
		}
		
		else
		{
			u32 is_single_char = 0;
			for(u32 i = 0; i < sizeof(single_chars) - 1 && !is_single_char; i++)
			{
				is_single_char = single_char_tok(next_token , curr_char , single_chars[i]);
			}
			
			if(is_single_char)
			{
				curr_char++;
			}
			
			else
			{
				// unknown text: type stays TOK_UNKNOWN, take everything up to the next blank
				while(curr_char < end && !is_blank_space(*curr_char))
				{
					curr_char++;
				}
				
				next_token -> size = (u32)(curr_char-next_token -> chars);
			}
		}
		
		count++;
	}
	
	mem_push(arena , Token , sizeof(Token)*count);
	
	*tokens_ptr = tokens;
	*count_ptr = count;
}

/*
Copies the text of token into buffer and null terminates it.

Returns the number of characters written including the terminator. buffer must have
room for token -> size + 1 characters; the size of buffer is not checked here.
*/
u32 copy_token_content(char * buffer , Token * token)
{
	char * terminator = buffer + mem_copy(buffer , token -> chars , token -> size);
	*terminator = 0;
	
	return((terminator + 1)-buffer);
}

// Returns 1 when the text of token is exactly the null terminated string in buffer, 0 otherwise.
inline
u32 token_eql(Token * token , char * buffer)
{
	u32 size = str_size(buffer);
	
	// different lengths can never be equal, so the contents are only compared on a length match
	if(size != token -> size)
	{
		return(0);
	}
	
	return(mem_eql(token -> chars , buffer , size) ? 1 : 0);
}

// Turns the line lock on: from now on blocked_by_line reports a token on a
// different line than the previously consumed one as unavailable.
inline
void lock_line(TokenCounter * counter)
{
	counter->lock_line = 1;
}

// Turns the line lock off again, so the counter may move on to tokens of any line.
inline
void unlock_line(TokenCounter * counter)
{
	counter->lock_line = 0;
}

/*
Returns 1 when the line lock is on and the current token is on a different line than
the previously consumed token, 0 otherwise.

Without a previous or current token nothing is blocked. The current token is
dereferenced, so callers check that it is in range first.
*/
inline
u32 blocked_by_line(TokenCounter * counter)
{
	Token * prev = counter -> prev_token;
	Token * curr = counter -> curr_token;
	
	if(!prev || !curr)
	{
		return(0);
	}
	
	if(!counter -> lock_line)
	{
		return(0);
	}
	
	return((curr -> line != prev -> line) ? 1 : 0);
}

/*
Consumes the current token if there is one and the line lock allows it.

On success prev_token becomes the consumed token and curr_token moves one token
forward; otherwise the counter is left as it was. Returns curr_token as it is after
the call.
*/
inline
Token * next(TokenCounter * counter)
{
	u32 can_advance = (counter -> curr_token < counter -> end_token) && !blocked_by_line(counter);
	
	if(can_advance)
	{
		counter -> prev_token = counter -> curr_token;
		counter -> curr_token += 1;
	}
	
	return(counter -> curr_token);
}

/*
Returns 1 when the current token is available (in range and not blocked by the line
lock) and its text is exactly the null terminated string target. Returns 0 otherwise.
The token is not consumed.
*/
inline
u32 peek(char * target , TokenCounter * counter)
{
	Token * token = counter -> curr_token;
	u32 target_size = str_size(target);
	u32 result = 0;
	
	u32 available = (token < counter -> end_token) && !blocked_by_line(counter);
	
	if(available && target_size == token -> size && mem_eql(target , token -> chars , target_size))
	{
		result = 1;
	}
	
	// token was never advanced, so this stores back the value curr_token already had
	counter -> curr_token = token;
	
	return(result);
}

/*
Returns 1 when the current token is available (in range and not blocked by the line
lock) and its type equals target, 0 otherwise. The token is not consumed.

target is compared as an unsigned byte, so both the small TOK_* codes and
punctuation characters can be passed.
*/
inline
u32 peek(char target , TokenCounter * counter)
{
	Token * token = counter -> curr_token;
	
	if(token >= counter -> end_token)
	{
		return(0);
	}
	
	if(blocked_by_line(counter))
	{
		return(0);
	}
	
	return((token -> type == (u8)target) ? 1 : 0);
}




/*
Consumes the current token when it is available (in range and not blocked by the line
lock) and its text is exactly the null terminated string target.

Returns 1 and moves the counter on (prev_token becomes the matched token) when it
matched. Returns 0 and leaves the counter unchanged otherwise.
*/
inline
u32 match(char * target , TokenCounter * counter)
{
	u32 target_size = str_size(target);
	Token * token = counter -> curr_token;
	
	if(token >= counter -> end_token || blocked_by_line(counter))
	{
		return(0);
	}
	
	if(target_size != token -> size || !mem_eql(target , token -> chars , target_size))
	{
		return(0);
	}
	
	counter -> prev_token = token;
	counter -> curr_token = token + 1;
	
	return(1);
}


/*
Consumes the current token when it is available (in range and not blocked by the line
lock) and its type equals target, compared as an unsigned byte.

Returns 1 and moves the counter on (prev_token becomes the matched token) when it
matched. Returns 0 and leaves the counter unchanged otherwise.
*/
inline
u32 match(char target , TokenCounter * counter)
{
	Token * token = counter -> curr_token;
	
	if(token >= counter -> end_token || blocked_by_line(counter))
	{
		return(0);
	}
	
	if(token -> type != (u8)target)
	{
		return(0);
	}
	
	counter -> prev_token = token;
	counter -> curr_token = token + 1;
	
	return(1);
}

/*
Groups tokens into uninterpreted instructions.

tokens , token_count  the tokens to parse
instrs                output array; entries are appended starting at *instr_count_ptr
instr_count_ptr       in: number of entries already in instrs, out: new number of entries
instr_capacity        total number of entries instrs can hold (asserted)
arena                 not used by this function

Each entry is either
- a tag:          identifier ':'                       -> tag is set, mnemonic stays null
- an instruction: identifier followed by operands on the same line, where an operand is
                  '$' identifier (register), an integer (imm) or a bare identifier (tag)

When an operand cannot be parsed an error is reported, the rest of the line is skipped
and the partially filled entry is still counted. A line that does not start with an
identifier is reported and skipped without producing an entry.
*/
void parse_MIPS(
				Token * tokens , u32 token_count , 
				UninterpretedInstr * instrs , u32 * instr_count_ptr , u32 instr_capacity , 
				Mem * arena)
{
	
	u32 instr_count = *instr_count_ptr;
	
	TokenCounter counter_ = {};
	TokenCounter * counter = &counter_;
	counter -> curr_token = tokens;
	counter -> end_token = tokens + token_count;
	
	while(counter -> curr_token < counter -> end_token)
	{
		
		u32 skip_line = 0;
		u32 instr_line = counter -> curr_token -> line;
		
		assert(instr_count < instr_capacity);
		UninterpretedInstr * next_instr = instrs + instr_count;
		*next_instr = {};
		
		if(match(TOK_IDENTIFIER , counter))
		{
			
			Token * mnemonic_token = counter -> prev_token;
			if(match(':' , counter))
			{
				next_instr -> tag = mnemonic_token;
			}
			
			else
			{
				
				next_instr -> mnemonic = mnemonic_token;
				
				// The range check comes first so the token past the end is never read.
				// An operand error ends the loop, because the failing token is not consumed
				// and retrying it would never make progress.
				while(
					  !skip_line && 
					  counter -> curr_token < counter -> end_token && 
					  counter -> curr_token -> line == instr_line)
				{
					lock_line(counter);
					if(match('$' , counter))
					{
						if(match(TOK_IDENTIFIER , counter))
						{
							assert(next_instr -> reg_count < array_size(next_instr -> regs));
							next_instr -> regs[next_instr -> reg_count++] = counter -> prev_token;
						}
						
						else
						{
							// NOTE(review): if '$' was the very last token, curr_token equals end_token
							// here and the macro reads its line - confirm the array has a spare slot
							report_compiler_error("expected register name after $" , 1 , counter);
							skip_line = 1;
						}
					}
					
					// match consumes exactly the integer token, for both signs
					else if(match(TOK_POS_INTEGER , counter) || match(TOK_NEG_INTEGER , counter))
					{
						next_instr -> imm = counter -> prev_token;
					}
					
					else if(match(TOK_IDENTIFIER , counter))
					{
						next_instr -> tag = counter -> prev_token;
					}
					
					else
					{
						report_compiler_error("unknown language contruct" , 1 , counter);
						skip_line = 1;
					}
					
					unlock_line(counter);
				}
			}
			instr_count++;
		}
		
		else
		{
			report_compiler_error("expected mnemonic" , 1 , counter);
			skip_line = 1;
		}
		
		if(skip_line)
		{
			// the line lock is off here, so next always advances while tokens remain
			while(counter -> curr_token < counter -> end_token && counter -> curr_token -> line == instr_line)
			{
				next(counter);
			}
		}
	}
	
	*instr_count_ptr = instr_count;
}


/*
Prints every token on its own line, wrapped in square brackets.

tokens  first token to print
count   number of tokens to print
*/
void print_tokens(Token * tokens , u32 count)
{
	
	for(u32 i = 0; i < count; i++)
	{
		
		Token * curr_token = tokens + i;
		
		// token text is not null terminated, so the length is passed as the precision;
		// printing straight from the source text puts no limit on the token length
		printf("[%.*s]\n" , (int)curr_token -> size , curr_token -> chars);
	}
}