Radicalc  Artifact [cd1b2210f7]

Artifact cd1b2210f70294845c1b9fec34ffb2fba4f2edd705724d0f36ddde921a936cef:

  • File bootstrap/main.c — part of check-in [fd180a44da] at 2017-07-13 00:33:28 on branch trunk — mangled names (user: athaudia size: 14252)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>


/******************************************************************************/
/*                                                                            */
/*   TOKENISER                                                                */
/*                                                                            */
/******************************************************************************/

enum token_type {
	TOK_NONE,
	TOK_PAREN_OPEN, TOK_PAREN_CLOSE, TOK_COMMA, TOK_COLON, 
	TOK_NEWLINE, TOK_EOF,
	TOK_FUN, TOK_END,
	TOK_IDENT, TOK_LITERAL_INT
};

struct token {
	enum token_type type;
	char* str;
};

int ch;
char* buf;
int buf_len;

struct token* tokens;
int token_len;

int isidentch(int c) {
	return isalnum(c) || c == '_';
}

void next_ch() {
	ch = getchar();
}

void accept_ch() {
	buf = realloc(buf, ++buf_len);
	buf[buf_len-1] = (char)ch;	
}

void skip_ws() {
	//printf("attempting to skip whitespace, next (%d)\n", ch);
	while(ch == ' ' || ch == '\t') {
		//printf("skipping ws: (%d)\n", ch);
		next_ch();
	}
}

void finalise_buf() {
	buf = realloc(buf, ++buf_len);
	buf[buf_len-1] = 0;
}

void reset_buf() { /* warning, memory leaking by design */
	buf = 0;
	buf_len = 0;
}

void add_token(enum token_type type, char* str) {
	tokens = realloc(tokens, sizeof(struct token) * (++token_len));
	tokens[token_len - 1].type = type;
	tokens[token_len - 1].str = str;
}

void tokenise() {
	enum token_type mode = TOK_NONE;
	reset_buf();
	tokens = 0;
	token_len = 0;
	next_ch();
	do {
		printf(">> %c <<\n", ch);
		switch(mode) {
			case TOK_NONE:
				if(ch == '(') {
					add_token(TOK_PAREN_OPEN, 0);
					next_ch();
					skip_ws();
				}
				else if(ch == ')') {
					add_token(TOK_PAREN_CLOSE, 0);
					next_ch();
					skip_ws();
				}
				else if(ch == ',') {
					add_token(TOK_COMMA, 0);
					next_ch();
					skip_ws();
				}
				else if(ch == ':') {
					add_token(TOK_COLON, 0);
					next_ch();
					skip_ws();
				}
				else if(ch == '\n' || ch == '\r') {
					add_token(TOK_NEWLINE, 0);
					next_ch();
					/* collapse all versions, and multiples to a single token */
					while(ch == '\n' || ch == '\r') next_ch();
					skip_ws();
				}
				else if(isalpha(ch)) {
					mode = TOK_IDENT;
					accept_ch();
					next_ch();
				}
				else if(isdigit(ch)) {
					mode = TOK_LITERAL_INT;
					accept_ch();
					next_ch();
				}
				else {
					printf("ERROR: unexpected character '%c' (%d)\n", ch, ch);
					return;
				}
				break;
			case TOK_IDENT:
				if(isidentch(ch)) {
					accept_ch();
					next_ch();
				}
				else {
					finalise_buf();
					if(strcmp(buf, "fun") == 0)
						add_token(TOK_FUN, buf);
					else if(strcmp(buf, "end") == 0)
						add_token(TOK_END, buf);
					else
						add_token(TOK_IDENT, buf);
					printf("ident: >>%s<<\n", buf);
					reset_buf();
					mode = TOK_NONE;
					skip_ws();
				}
				break;
			case TOK_LITERAL_INT:
				if(isdigit(ch)) {
					accept_ch();
					next_ch();
				}
				else {
					finalise_buf();
					add_token(TOK_LITERAL_INT, buf);
					printf("int_literal: >>%s<<\n", buf);
					reset_buf();
					mode = TOK_NONE;
					skip_ws();
				}
				break;
			default:
				printf("ERROR: unknown mode\n");
				return;
		}
	} while(ch != EOF);
	add_token(TOK_EOF, 0);
}


/******************************************************************************/
/*                                                                            */
/*   PARSER                                                                   */
/*                                                                            */
/******************************************************************************/

int parse_pos;

enum node_type { NODE_ROOT, NODE_FUNBODY, NODE_FUNDEF, NODE_FUNCALL,
	NODE_VAR_USE, NODE_LITERAL_INT };

struct data_type {
	char* name;
};

struct function;

struct node {
	struct node** childs;
	int child_count;
	char* name;
	enum node_type type;
	char* val;
	char* data_type_str;
	struct data_type* data_type;
	struct function* fun;
};

struct node* node_new(enum node_type type) {
	struct node* node = malloc(sizeof(struct node));
	node->childs = 0;
	node->child_count = 0;
	node->name = 0;
	node->type = type;
	node->val = 0;
	node->data_type_str = 0;
	node->data_type = 0;
	node->fun = 0;
	return node;
}

void node_add_child(struct node* parent, struct node* child) {
	int new_size = sizeof(struct node*) * (++parent->child_count);
	parent->childs = realloc(parent->childs, new_size);
	parent->childs[parent->child_count-1] = child;
}

struct function_parameter {
	char* data_type_str;
	struct data_type* data_type;
	char* name;
};

struct function {
	char* name;
	char* return_type_str;
	struct data_type* return_type;
	struct function_parameter* params;
	int param_count;
	struct node* body;
};

struct function* function_new() {
	struct function* fun = malloc(sizeof(struct function));
	fun->name = 0;
	fun->return_type_str = 0;
	fun->return_type = 0;
	fun->params = 0;
	fun->param_count = 0;
	fun->body = node_new(NODE_FUNBODY);
}

void function_add_param(struct function* fun, char* name, char* type) {
	int new_size = sizeof(struct function_parameter) * (++fun->param_count);
	fun->params = realloc(fun->params, new_size);
	fun->params[fun->param_count-1].name = name;
	fun->params[fun->param_count-1].data_type_str = type;
}

struct token* expect(enum token_type type) {
	if(type == tokens[parse_pos].type) {
		++parse_pos;
		return &tokens[parse_pos-1];
	}
	else {
		printf("expected (%d), got (%d), at (%d)\n",
		       type, tokens[parse_pos].type, parse_pos);
		exit(-1);
	}
}

int is_next_tok(enum token_type type) {
	return tokens[parse_pos].type == type;
}

struct node* p_expr();

struct node* p_func_call() {
	struct node* node = node_new(NODE_FUNCALL);
	node->name = expect(TOK_IDENT)->str;
	expect(TOK_PAREN_OPEN);
	printf("1\n");
	if(tokens[parse_pos].type != TOK_PAREN_CLOSE) {
		printf("2\n");
		node_add_child(node, p_expr());
		printf("3\n");
		while(tokens[parse_pos].type == TOK_COMMA) {
			printf("4\n");
			expect(TOK_COMMA);
			node_add_child(node, p_expr());
		}
	}
	expect(TOK_PAREN_CLOSE);
	return node;
}

struct node* p_var_use() {
	struct node* node = node_new(NODE_VAR_USE);
	node->name = expect(TOK_IDENT)->str;
	return node;
}

struct node* p_expr() {
	switch(tokens[parse_pos].type) {
		case TOK_IDENT:
			if(tokens[parse_pos+1].type == TOK_PAREN_OPEN)
				return p_func_call();
			else
				return p_var_use();
		case TOK_LITERAL_INT:
		{
			struct node* node = node_new(NODE_LITERAL_INT);
			node->val = expect(TOK_LITERAL_INT)->str;
			return node;
		}
		default:
			printf("can't parse expression, (%d)\n", tokens[parse_pos].type);
			exit(-1);
	}
}

struct node* p_statement() {
	printf("p_statement\n");
	struct node* node;
	switch(tokens[parse_pos].type) {
		default:
			node = p_expr();
			expect(TOK_NEWLINE);
	}
	return node;
}

struct function* p_fun() {
	struct function* fun = function_new();
	expect(TOK_FUN);
	fun->name = expect(TOK_IDENT)->str;
	expect(TOK_PAREN_OPEN);
	if(tokens[parse_pos].type != TOK_PAREN_CLOSE) {
		for(;;) {
			char* name;
			char* type;
			name = expect(TOK_IDENT)->str;
			expect(TOK_COLON);
			type = expect(TOK_IDENT)->str;
			function_add_param(fun, name, type);
			if(tokens[parse_pos].type == TOK_COMMA)
				expect(TOK_COMMA);
			else
				break;
		}
	}
	expect(TOK_PAREN_CLOSE);
	if(tokens[parse_pos].type == TOK_COLON) {
		expect(TOK_COLON);
		fun->return_type_str = expect(TOK_IDENT)->str;
	}
	expect(TOK_NEWLINE);
	while(!is_next_tok(TOK_END)) {
		node_add_child(fun->body, p_statement());
	}
	expect(TOK_END);
	expect(TOK_NEWLINE);
	return fun;
}

struct function** functions;
int function_count;

void add_function(struct function* function) {
	functions = realloc(functions, sizeof(struct function*)*(++function_count));
	functions[function_count-1] = function;
}


void parse() {
	functions = 0;
	function_count = 0;
	parse_pos = 0;
	while(tokens[parse_pos].type != TOK_EOF) {
		switch(tokens[parse_pos].type) {
			case TOK_NEWLINE:
				break;
			case TOK_FUN:
				add_function(p_fun());
		}
	}
}

/******************************************************************************/
/*                                                                            */
/*   ANNOTATE                                                                 */
/*                                                                            */
/******************************************************************************/

struct data_type* data_types;
int data_type_count;

void add_data_type(char* name) {
	int new_size = sizeof(struct data_type) * (++data_type_count);
	data_types = realloc(data_types, new_size);
	data_types[data_type_count-1].name = name;
}

void init_data_types() { /* internal data types */
	data_types = 0;
	data_type_count = 0;
	add_data_type("void");
	add_data_type("i8");
	add_data_type("i16");
	add_data_type("i32");
	add_data_type("i64");
	add_data_type("u8");
	add_data_type("u16");
	add_data_type("u32");
	add_data_type("u64");
	add_data_type("f32");
	add_data_type("f64");
}

struct data_type* get_data_type(char* name) {
	for(int i = 0; i < data_type_count; ++i) {
		if(strcmp(name, data_types[i].name) == 0)
			return &data_types[i];
	}
	return 0;
}

struct function* get_function(char* name, struct data_type** params, int param_count) {
	for(int i = 0; i < function_count; ++i) {
		struct function* fun = functions[i];
		if(strcmp(fun->name, name) == 0 && fun->param_count == param_count) {
			for(int j = 0; j < param_count; ++j) {
				if(fun->params[j].data_type != params[j])
					goto tryagain;
			}
			return fun;
		}
		tryagain: ;
	}
	return 0;
}


void annotate_data_type(struct data_type** data_type, char* name) {
	if(name == 0) { /* nullpointer == void */
		*data_type = &data_types[0];
		return;
	}

	for(int i = 0; i < data_type_count; ++i) {
		if(strcmp(data_types[i].name, name) == 0) {
			*data_type = &data_types[i];
			return;
		}
	}
	
	printf("unknown datatype '%s'\n", name);
	exit(-1);
}

void annotate_function_data_types(struct function* fun) {
	annotate_data_type(&fun->return_type, fun->return_type_str);
	for(int i = 0; i < fun->param_count; ++i) {
		annotate_data_type(&fun->params[i].data_type, fun->params[i].data_type_str);
	}
}

void annotate_node_data_types(struct node* node) {
	for(int i = 0; i < node->child_count; ++i) {
		annotate_node_data_types(node->childs[i]);
	}

	switch(node->type) {
		case NODE_LITERAL_INT:
			node->data_type = get_data_type("i32"); /* temporary */
			break;
		case NODE_FUNCALL: {
			int size = sizeof(struct data_type*) * node->child_count;
			struct data_type** params = malloc(size);
			for(int i = 0; i < node->child_count; ++i) {
				params[i] = node->childs[i]->data_type;
			}
			struct function* fun = get_function(node->name, params, node->child_count);
			if(!fun) {
				printf("error, function %s(", node->name);
				for(int i = 0; i < node->child_count; ++i) {
					printf("%s", params[i]->name);
					if(i < node->child_count-1)
						printf(", ");
				}
				printf(") not found but used\n");
				exit(-1);
			}
			node->fun = fun;
			node->data_type = fun->return_type;
			break;
		}
		case NODE_FUNBODY:
			break;
		default:
			printf("error, not yet annotating %i\n", node->type);
			exit(-1);
	}
}

void annotate_data_types() {
	for(int i = 0; i < function_count; ++i) {
		printf("%s\n", functions[i]->name);
		annotate_function_data_types(functions[i]);
	}
	
	for(int i = 0; i < function_count; ++i) {
		printf("%s\n", functions[i]->name);
		annotate_node_data_types(functions[i]->body);
	}
}

/******************************************************************************/
/*                                                                            */
/*   MAIN                                                                     */
/*                                                                            */
/******************************************************************************/

void print_tokens() {
	for(int i = 0; i < token_len; ++i)
		printf("TOKEN %d %s\n", tokens[i].type, tokens[i].str);
}

void print_indent(int num) {
	for(int i = 0; i < num; ++i) printf("  ");
}


void print_data_type(struct data_type* data_type) {
	printf("%s", data_type->name);
}

void print_mangled_function_name(struct function* fun) {
	printf("%s__%s", fun->name, fun->return_type->name);
	for(int i = 0; i < fun->param_count; ++i)
		printf("__%s", fun->params[i].data_type->name);
}

void print_tree(struct node* node, int indent) {
	switch(node->type) {
		case NODE_ROOT:
			for(int j = 0; j < node->child_count; ++j) {
				print_tree(node->childs[j], 0);
				printf("\n");
			}
			break;
		case NODE_FUNCALL:
			print_indent(indent);
			print_mangled_function_name(node->fun);
			printf("(\n");
			for(int j = 0; j < node->child_count; ++j) {
				print_tree(node->childs[j], indent+1);
				if(j < node->child_count-1)
					printf(",\n");
				else
					printf("\n");
			}
			print_indent(indent);
			printf(")");
			break;
		case NODE_VAR_USE:
			print_indent(indent);
			printf("%s", node->name);
			break;
		case NODE_LITERAL_INT:
			print_indent(indent);
			printf("%s", node->val);
			break;
		default:
			printf("(UNKNOWN TOKEN)");
	}
}

void print_functions() {
	for(int i = 0; i < function_count; ++i) {
		struct function* fun = functions[i];
		print_data_type(fun->return_type);
		printf(" ");
		print_mangled_function_name(fun);
		printf("(");
		for(int i = 0; i < fun->param_count; ++i) {
			print_data_type(fun->params[i].data_type);
			printf(" %s", fun->params[i].name);
			if(i < fun->param_count-1)
				printf(", ");
		}
		printf(")\n");
		for(int j = 0; j < fun->body->child_count; ++j) {
			print_tree(fun->body->childs[j], 1);
			printf(";\n");
		}
		printf("}\n\n");
	}
}

int main() {
	tokenise();
	print_tokens();
	parse();
	init_data_types();
	annotate_data_types();
	print_functions();
	printf("-- %d\n", function_count);
	return 0;
}

/* TODO:
   - have parameter types and return types be part of function signature
*/

/* NOTES:
   - value types can be stack or heap allocated behind the scenes,
     having the compiler insert malloc/free style syscalls where needed.
*/