#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/******************************************************************************/
/* */
/* TOKENISER */
/* */
/******************************************************************************/
enum token_type {
TOK_NONE,
TOK_PAREN_OPEN, TOK_PAREN_CLOSE, TOK_COMMA, TOK_COLON,
TOK_NEWLINE, TOK_EOF,
TOK_FUN, TOK_END,
TOK_IDENT, TOK_LITERAL_INT
};
struct token {
enum token_type type;
char* str;
};
int ch;
char* buf;
int buf_len;
struct token* tokens;
int token_len;
int isidentch(int c) {
return isalnum(c) || c == '_';
}
void next_ch() {
ch = getchar();
}
void accept_ch() {
buf = realloc(buf, ++buf_len);
buf[buf_len-1] = (char)ch;
}
void skip_ws() {
//printf("attempting to skip whitespace, next (%d)\n", ch);
while(ch == ' ' || ch == '\t') {
//printf("skipping ws: (%d)\n", ch);
next_ch();
}
}
void finalise_buf() {
buf = realloc(buf, ++buf_len);
buf[buf_len-1] = 0;
}
void reset_buf() { /* warning, memory leaking by design */
buf = 0;
buf_len = 0;
}
void add_token(enum token_type type, char* str) {
tokens = realloc(tokens, sizeof(struct token) * (++token_len));
tokens[token_len - 1].type = type;
tokens[token_len - 1].str = str;
}
void tokenise() {
enum token_type mode = TOK_NONE;
reset_buf();
tokens = 0;
token_len = 0;
next_ch();
do {
printf(">> %c <<\n", ch);
switch(mode) {
case TOK_NONE:
if(ch == '(') {
add_token(TOK_PAREN_OPEN, 0);
next_ch();
skip_ws();
}
else if(ch == ')') {
add_token(TOK_PAREN_CLOSE, 0);
next_ch();
skip_ws();
}
else if(ch == ',') {
add_token(TOK_COMMA, 0);
next_ch();
skip_ws();
}
else if(ch == ':') {
add_token(TOK_COLON, 0);
next_ch();
skip_ws();
}
else if(ch == '\n' || ch == '\r') {
add_token(TOK_NEWLINE, 0);
next_ch();
/* collapse all versions, and multiples to a single token */
while(ch == '\n' || ch == '\r') next_ch();
skip_ws();
}
else if(isalpha(ch)) {
mode = TOK_IDENT;
accept_ch();
next_ch();
}
else if(isdigit(ch)) {
mode = TOK_LITERAL_INT;
accept_ch();
next_ch();
}
else {
printf("ERROR: unexpected character '%c' (%d)\n", ch, ch);
return;
}
break;
case TOK_IDENT:
if(isidentch(ch)) {
accept_ch();
next_ch();
}
else {
finalise_buf();
if(strcmp(buf, "fun") == 0)
add_token(TOK_FUN, buf);
else if(strcmp(buf, "end") == 0)
add_token(TOK_END, buf);
else
add_token(TOK_IDENT, buf);
printf("ident: >>%s<<\n", buf);
reset_buf();
mode = TOK_NONE;
skip_ws();
}
break;
case TOK_LITERAL_INT:
if(isdigit(ch)) {
accept_ch();
next_ch();
}
else {
finalise_buf();
add_token(TOK_LITERAL_INT, buf);
printf("int_literal: >>%s<<\n", buf);
reset_buf();
mode = TOK_NONE;
skip_ws();
}
break;
default:
printf("ERROR: unknown mode\n");
return;
}
} while(ch != EOF);
add_token(TOK_EOF, 0);
}
/******************************************************************************/
/* */
/* PARSER */
/* */
/******************************************************************************/
int parse_pos;
enum node_type { NODE_ROOT, NODE_FUNBODY, NODE_FUNDEF, NODE_FUNCALL,
NODE_VAR_USE, NODE_LITERAL_INT };
struct data_type {
char* name;
};
struct function;
struct node {
struct node** childs;
int child_count;
char* name;
enum node_type type;
char* val;
char* data_type_str;
struct data_type* data_type;
struct function* fun;
};
struct node* node_new(enum node_type type) {
struct node* node = malloc(sizeof(struct node));
node->childs = 0;
node->child_count = 0;
node->name = 0;
node->type = type;
node->val = 0;
node->data_type_str = 0;
node->data_type = 0;
node->fun = 0;
return node;
}
void node_add_child(struct node* parent, struct node* child) {
int new_size = sizeof(struct node*) * (++parent->child_count);
parent->childs = realloc(parent->childs, new_size);
parent->childs[parent->child_count-1] = child;
}
struct function_parameter {
char* data_type_str;
struct data_type* data_type;
char* name;
};
struct function {
char* name;
char* return_type_str;
struct data_type* return_type;
struct function_parameter* params;
int param_count;
struct node* body;
};
struct function* function_new() {
struct function* fun = malloc(sizeof(struct function));
fun->name = 0;
fun->return_type_str = 0;
fun->return_type = 0;
fun->params = 0;
fun->param_count = 0;
fun->body = node_new(NODE_FUNBODY);
}
void function_add_param(struct function* fun, char* name, char* type) {
int new_size = sizeof(struct function_parameter) * (++fun->param_count);
fun->params = realloc(fun->params, new_size);
fun->params[fun->param_count-1].name = name;
fun->params[fun->param_count-1].data_type_str = type;
}
struct token* expect(enum token_type type) {
if(type == tokens[parse_pos].type) {
++parse_pos;
return &tokens[parse_pos-1];
}
else {
printf("expected (%d), got (%d), at (%d)\n",
type, tokens[parse_pos].type, parse_pos);
exit(-1);
}
}
int is_next_tok(enum token_type type) {
return tokens[parse_pos].type == type;
}
struct node* p_expr();
struct node* p_func_call() {
struct node* node = node_new(NODE_FUNCALL);
node->name = expect(TOK_IDENT)->str;
expect(TOK_PAREN_OPEN);
printf("1\n");
if(tokens[parse_pos].type != TOK_PAREN_CLOSE) {
printf("2\n");
node_add_child(node, p_expr());
printf("3\n");
while(tokens[parse_pos].type == TOK_COMMA) {
printf("4\n");
expect(TOK_COMMA);
node_add_child(node, p_expr());
}
}
expect(TOK_PAREN_CLOSE);
return node;
}
struct node* p_var_use() {
struct node* node = node_new(NODE_VAR_USE);
node->name = expect(TOK_IDENT)->str;
return node;
}
struct node* p_expr() {
switch(tokens[parse_pos].type) {
case TOK_IDENT:
if(tokens[parse_pos+1].type == TOK_PAREN_OPEN)
return p_func_call();
else
return p_var_use();
case TOK_LITERAL_INT:
{
struct node* node = node_new(NODE_LITERAL_INT);
node->val = expect(TOK_LITERAL_INT)->str;
return node;
}
default:
printf("can't parse expression, (%d)\n", tokens[parse_pos].type);
exit(-1);
}
}
struct node* p_statement() {
printf("p_statement\n");
struct node* node;
switch(tokens[parse_pos].type) {
default:
node = p_expr();
expect(TOK_NEWLINE);
}
return node;
}
struct function* p_fun() {
struct function* fun = function_new();
expect(TOK_FUN);
fun->name = expect(TOK_IDENT)->str;
expect(TOK_PAREN_OPEN);
if(tokens[parse_pos].type != TOK_PAREN_CLOSE) {
for(;;) {
char* name;
char* type;
name = expect(TOK_IDENT)->str;
expect(TOK_COLON);
type = expect(TOK_IDENT)->str;
function_add_param(fun, name, type);
if(tokens[parse_pos].type == TOK_COMMA)
expect(TOK_COMMA);
else
break;
}
}
expect(TOK_PAREN_CLOSE);
if(tokens[parse_pos].type == TOK_COLON) {
expect(TOK_COLON);
fun->return_type_str = expect(TOK_IDENT)->str;
}
expect(TOK_NEWLINE);
while(!is_next_tok(TOK_END)) {
node_add_child(fun->body, p_statement());
}
expect(TOK_END);
expect(TOK_NEWLINE);
return fun;
}
struct function** functions;
int function_count;
void add_function(struct function* function) {
functions = realloc(functions, sizeof(struct function*)*(++function_count));
functions[function_count-1] = function;
}
void parse() {
functions = 0;
function_count = 0;
parse_pos = 0;
while(tokens[parse_pos].type != TOK_EOF) {
switch(tokens[parse_pos].type) {
case TOK_NEWLINE:
break;
case TOK_FUN:
add_function(p_fun());
}
}
}
/******************************************************************************/
/* */
/* ANNOTATE */
/* */
/******************************************************************************/
struct data_type* data_types;
int data_type_count;
void add_data_type(char* name) {
int new_size = sizeof(struct data_type) * (++data_type_count);
data_types = realloc(data_types, new_size);
data_types[data_type_count-1].name = name;
}
void init_data_types() { /* internal data types */
data_types = 0;
data_type_count = 0;
add_data_type("void");
add_data_type("i8");
add_data_type("i16");
add_data_type("i32");
add_data_type("i64");
add_data_type("u8");
add_data_type("u16");
add_data_type("u32");
add_data_type("u64");
add_data_type("f32");
add_data_type("f64");
}
struct data_type* get_data_type(char* name) {
for(int i = 0; i < data_type_count; ++i) {
if(strcmp(name, data_types[i].name) == 0)
return &data_types[i];
}
return 0;
}
struct function* get_function(char* name, struct data_type** params, int param_count) {
for(int i = 0; i < function_count; ++i) {
struct function* fun = functions[i];
if(strcmp(fun->name, name) == 0 && fun->param_count == param_count) {
for(int j = 0; j < param_count; ++j) {
if(fun->params[j].data_type != params[j])
goto tryagain;
}
return fun;
}
tryagain: ;
}
return 0;
}
void annotate_data_type(struct data_type** data_type, char* name) {
if(name == 0) { /* nullpointer == void */
*data_type = &data_types[0];
return;
}
for(int i = 0; i < data_type_count; ++i) {
if(strcmp(data_types[i].name, name) == 0) {
*data_type = &data_types[i];
return;
}
}
printf("unknown datatype '%s'\n", name);
exit(-1);
}
void annotate_function_data_types(struct function* fun) {
annotate_data_type(&fun->return_type, fun->return_type_str);
for(int i = 0; i < fun->param_count; ++i) {
annotate_data_type(&fun->params[i].data_type, fun->params[i].data_type_str);
}
}
void annotate_node_data_types(struct node* node) {
for(int i = 0; i < node->child_count; ++i) {
annotate_node_data_types(node->childs[i]);
}
switch(node->type) {
case NODE_LITERAL_INT:
node->data_type = get_data_type("i32"); /* temporary */
break;
case NODE_FUNCALL: {
int size = sizeof(struct data_type*) * node->child_count;
struct data_type** params = malloc(size);
for(int i = 0; i < node->child_count; ++i) {
params[i] = node->childs[i]->data_type;
}
struct function* fun = get_function(node->name, params, node->child_count);
if(!fun) {
printf("error, function %s(", node->name);
for(int i = 0; i < node->child_count; ++i) {
printf("%s", params[i]->name);
if(i < node->child_count-1)
printf(", ");
}
printf(") not found but used\n");
exit(-1);
}
node->fun = fun;
node->data_type = fun->return_type;
break;
}
case NODE_FUNBODY:
break;
default:
printf("error, not yet annotating %i\n", node->type);
exit(-1);
}
}
void annotate_data_types() {
for(int i = 0; i < function_count; ++i) {
printf("%s\n", functions[i]->name);
annotate_function_data_types(functions[i]);
}
for(int i = 0; i < function_count; ++i) {
printf("%s\n", functions[i]->name);
annotate_node_data_types(functions[i]->body);
}
}
/******************************************************************************/
/* */
/* MAIN */
/* */
/******************************************************************************/
void print_tokens() {
for(int i = 0; i < token_len; ++i)
printf("TOKEN %d %s\n", tokens[i].type, tokens[i].str);
}
void print_indent(int num) {
for(int i = 0; i < num; ++i) printf(" ");
}
void print_data_type(struct data_type* data_type) {
printf("%s", data_type->name);
}
void print_mangled_function_name(struct function* fun) {
printf("%s__%s", fun->name, fun->return_type->name);
for(int i = 0; i < fun->param_count; ++i)
printf("__%s", fun->params[i].data_type->name);
}
void print_tree(struct node* node, int indent) {
switch(node->type) {
case NODE_ROOT:
for(int j = 0; j < node->child_count; ++j) {
print_tree(node->childs[j], 0);
printf("\n");
}
break;
case NODE_FUNCALL:
print_indent(indent);
print_mangled_function_name(node->fun);
printf("(\n");
for(int j = 0; j < node->child_count; ++j) {
print_tree(node->childs[j], indent+1);
if(j < node->child_count-1)
printf(",\n");
else
printf("\n");
}
print_indent(indent);
printf(")");
break;
case NODE_VAR_USE:
print_indent(indent);
printf("%s", node->name);
break;
case NODE_LITERAL_INT:
print_indent(indent);
printf("%s", node->val);
break;
default:
printf("(UNKNOWN TOKEN)");
}
}
void print_functions() {
for(int i = 0; i < function_count; ++i) {
struct function* fun = functions[i];
print_data_type(fun->return_type);
printf(" ");
print_mangled_function_name(fun);
printf("(");
for(int i = 0; i < fun->param_count; ++i) {
print_data_type(fun->params[i].data_type);
printf(" %s", fun->params[i].name);
if(i < fun->param_count-1)
printf(", ");
}
printf(")\n");
for(int j = 0; j < fun->body->child_count; ++j) {
print_tree(fun->body->childs[j], 1);
printf(";\n");
}
printf("}\n\n");
}
}
int main() {
tokenise();
print_tokens();
parse();
init_data_types();
annotate_data_types();
print_functions();
printf("-- %d\n", function_count);
return 0;
}
/* TODO:
- have parameter types and return types be part of function signature
*/
/* NOTES:
- value types can be stack or heap allocated behind the scenes,
having the compiler insert malloc/free style syscalls where needed.
*/