//  npu.c   (nat GPU)   19 Mar 2012
//		version 2 ISA   5 Dec 2012
//     	P. Chongstitvatana

#include <stdio.h>
#include <stdlib.h>
#include "npu.h"

#define  NPE  4		// number of processing elements (PEs)

// machine state
int R[NPE][32];		// register file, indexed R[pe][register]
int LDS[NPE];		// one LDS word per PE
int PC;				// index in M of the next instruction to fetch
int IR;				// instruction word currently being decoded
int RNG;			// not referenced by the code visible in this file
int M[2000];		// memory, holds both the program and its data

// fields of the current instruction, filled in by the decode step
int op;				// opcode
int a1;				// 14-bit ads field
int a2;				// r1 field
int a3;				// r2 field
int disp;			// a1 after sign extension
int runflag;		// the run loop continues while this is non-zero

// instruction decoder
//  word layout, high to low:  op:8 ads:14 r1:5 r2:5
//  each macro extracts one field from the current value of IR
#define opcode()  ((IR >> 24) & 0xff)		// bits 31..24
#define ads()     ((IR >> 10) & 0x3fff)	// bits 23..10
#define r1()      ((IR >> 5) & 0x1f)		// bits  9..5
#define r2()      (IR & 0x1f)				// bits  4..0

// Report a fatal error and terminate the simulator.
//   mess : text printed after the "error: " prefix
// Does not return.  The message is written to stderr and the process
// exits with a failure status, so a calling shell or makefile can tell
// that the run did not succeed.
void error(char *mess){
	fprintf(stderr, "error: %s\n", mess);
	exit(EXIT_FAILURE);
}

// Test register k across all PEs.
//   k : register number
// Returns 1 when R[pe][k] is zero for every PE, otherwise 0.
int testrz(int k){
	int pe = 0;
	while( pe < NPE ){
		if( R[pe][k] ) return 0;	// one non-zero copy is enough
		pe++;
	}
	return 1;
}

void execute(int op){
	int i;
	switch(op){
	case  xLd:  LDS[a2] = M[a1]; break;
	case  xSt:  M[a1] = LDS[a2]; break;
	case  xLdx:	LDS[a1] = M[R[a1][a2]+R[a1][a3]]; break;
	case  xStx:	M[R[a1][a2]+R[a1][a3]] = LDS[a1]; break;
	case  xLdr:
		for(i = 0; i < NPE; i++)
			R[i][a1] = LDS[i];
		break;
	case  xStr:
		for(i = 0; i < NPE; i++)
			LDS[i] = R[i][a1];
		break;
	case  xLdw:
		for(i = 0; i < NPE; i++)
			LDS[i] = M[a1];
		break;
	case  xBc:
		for(i = 0; i < NPE; i++)
			R[i][a1] = LDS[a2];
		break;
	case  xAdd:
		for(i = 0; i < NPE; i++)
			R[i][a1] = R[i][a2] + R[i][a3];
		break;
	case  xSub:
		for(i = 0; i < NPE; i++)
			R[i][a1] = R[i][a2] - R[i][a3];
		break;
	case  xMul:
		for(i = 0; i < NPE; i++)
			R[i][a1] = R[i][a2] * R[i][a3];
		break;
	case  xAshr:
		for(i = 0; i < NPE; i++)
			R[i][a2] = R[i][a3] >> a1;   // unsign !
		break;
	case  xAddi:
		for(i = 0; i < NPE; i++)
			R[i][a2] = R[i][a3] + disp;  // signex
		break;
	case  xAnd:
		for(i = 0; i < NPE; i++)
			R[i][a1] = R[i][a2] & R[i][a3];
		break;
	case  xOr:
		for(i = 0; i < NPE; i++)
			R[i][a1] = R[i][a2] | R[i][a3];
		break;
	case  xXor:
		for(i = 0; i < NPE; i++)
			R[i][a1] = R[i][a2] ^ R[i][a3];
		break;
	case  xLt:
		for(i = 0; i < NPE; i++)
			R[i][a1] = R[i][a2] < R[i][a3];
		break;
	case  xLe:
		for(i = 0; i < NPE; i++)
			R[i][a1] = R[i][a2] <= R[i][a3];
		break;
	case  xEq:
		for(i = 0; i < NPE; i++)
			R[i][a1] = R[i][a2] == R[i][a3];
		break;
	case  xJmp: PC = a1; break;
	case  xJz:	if( testrz(a2) ) PC = a1; break;
	case  xJnz:	if( !testrz(a2) ) PC = a1; break;
	case  xRnd: break;
	case  xMvt:
		for(i = 0; i < NPE; i++)
			if( R[i][a1] != 0 )
				R[i][a2] = R[i][a3];
		break;
	case  xSys:
		switch(a1){
		case 4: runflag = 0; break;
		}
		break;
	}
}


// Sign-extend a 14-bit field to an int.
//   a : value whose low 14 bits hold the field
// Returns a unchanged when bit 13 (the field's sign bit) is clear;
// otherwise returns the field read as a negative two's-complement
// number, in the range -8192..-1.
// Only signed arithmetic is used, so the result does not depend on how
// the compiler converts an out-of-range unsigned value to int.
int signx(int a){
	if( a & 0x2000 )
		return (a & 0x3fff) - 0x4000;
	return a;
}

// instruction mnemonics; showInst indexes this table with opcode - 10
char opsym[][8] = {
	"ld",   "st",   "ldr",  "str",
	"ldx",  "stx",  "ldw",  "bc",
	"add",  "sub",  "mul",  "ashr", "addi",
	"and",  "or",   "xor",
	"lt",   "le",   "eq",
	"jmp",  "jz",   "jnz",
	"rnd",  "mvt",  "sys",
	"inc",  "dec",  "clr",  "mov"
};


// number of cycles charged for each instruction, indexed by opcode - 10
// (the mnemonics on the right are the opsym entries at the same indexes)
int inst_cycle[] = {
	9, 9,				// ld st
	7, 7,				// ldr str
	9, 9, 9,			// ldx stx ldw
	7,					// bc
	7, 7, 7, 7, 7,		// add sub mul ashr addi
	7, 7, 7,			// and or xor
	7, 7, 7,			// lt le eq
	6, 6, 7,			// jmp jz jnz
	7, 7, 7,			// rnd mvt sys
	7, 7, 7, 7			// inc dec clr mov
};

// Print one trace line for an instruction: its address (PC-1, because
// PC has already been advanced past it), mnemonic and three operands.
//   op         : opcode; mnemonics are looked up at index op - 10
//   a1, a2, a3 : operand fields; a1 is shown sign-extended
// An opcode that has no entry in opsym is printed as "?<op>" instead of
// reading outside the table.
void showInst(int op, int a1, int a2, int a3){
	int nsym = (int)(sizeof opsym / sizeof opsym[0]);
	int k = op - 10;
	if( k >= 0 && k < nsym )
		printf("%d: %s %d %d %d\n",PC-1,opsym[k],signx(a1),a2,a3);
	else
		printf("%d: ?%d %d %d %d\n",PC-1,op,signx(a1),a2,a3);
}

void dumpInternal(void){
	int i;
	// dump R[0,1,2,3], LDS[.]
//	printf("%d:\n",PC);
	printf("R[0] %d %d %d %d\n",R[0][0],R[1][0],R[2][0],R[3][0]);
	printf("R[1] %d %d %d %d\n",R[0][1],R[1][1],R[2][1],R[3][1]);
	printf("R[2] %d %d %d %d\n",R[0][2],R[1][2],R[2][2],R[3][2]);
	printf("R[3] %d %d %d %d\n",R[0][3],R[1][3],R[2][3],R[3][3]);
	printf("LDS  %d %d %d %d\n",LDS[0],LDS[1],LDS[2],LDS[3]);
}

// Encode four fields into one instruction word, layout op:8 ads:14 r1:5 r2:5.
//   op : opcode, low 8 bits used
//   a1 : ads field, low 14 bits used (a negative value is stored in
//        14-bit two's complement)
//   a2 : r1 field, low 5 bits used
//   a3 : r2 field, low 5 bits used
// Returns the packed word.
// Every field is masked to its width before it is shifted, and the
// shifts are done on unsigned values: an out-of-range field cannot spill
// into its neighbours and a negative a1 is never left-shifted as a
// signed int.
int enc4(int op, int a1, int a2, int a3){
	unsigned int w;
	w  = ((unsigned int)op & 0xffu) << 24;
	w |= ((unsigned int)a1 & 0x3fffu) << 10;
	w |= ((unsigned int)a2 & 0x1fu) << 5;
	w |= (unsigned int)a3 & 0x1fu;
	return (int)w;
}

void loadobj(void){
	int i, op, a1, a2, a3;
	int magic, start, len;
	scanf("%d",&magic);
	if(magic != MAGIC) error("not npu object file\n");
	scanf("%d%d", &start, &len);
	for(i = start; i < start+len; i++){
		scanf("%d%d%d%d",&op,&a1,&a2,&a3);
		M[i] = enc4(op,a1,a2,a3);
//		printf("%d: %d %d %d %d\n",i,op,a1,a2,a3);
//		printf("%d: %d\n",i,M[i] >> 22);
	}
	scanf("%d%d", &start, &len);	// data
	for(i = start; i < start+len; i++){
		scanf("%d",&a1);
		M[i] = a1;
//		printf("%d:: %d\n",i,a1);
	}
}

// Prepare the machine for a run: execution starts at address 0 with the
// run flag set, and memory is filled from the object file on standard
// input.
void init(void){
	runflag = 1;
	PC = 0;
	loadobj();
}

/*
void spy(void){
	printf("here\n");
	printf("op %d a1 %d a2 %d a3 %d\n",op,a1,a2,a3);
}
*/
int icnt, cycle;	// instructions executed and cycles charged so far

// Load a program, run the fetch / decode / execute loop until the
// machine stops, then print the final state and the run statistics.
// The loop ends when an instruction clears runflag, when more than
// MAXINST instructions have been executed, or when PC no longer points
// inside M.
int main(void){
	enum { OPBASE = 10, MAXINST = 1000 };	// index 0 of inst_cycle is opcode OPBASE
	const int msize  = (int)(sizeof M / sizeof M[0]);
	const int ntable = (int)(sizeof inst_cycle / sizeof inst_cycle[0]);

	init();
	icnt = 0;
	cycle = 0;
	while(runflag){
		// a jump may have moved PC outside memory; never fetch from there
		if( PC < 0 || PC >= msize ){
			printf("error: PC %d outside memory\n", PC);
			break;
		}
		IR = M[PC]; 		// inst. fetch
		PC++;
		op = opcode();		// decode bit in IR
		a1 = ads();
		disp = signx(a1);
		a2 = r1();
		a3 = r2();
//		showInst(op,a1,a2,a3);
		execute(op);
//		dumpInternal();
		// only opcodes with an entry in inst_cycle are charged cycles
		if( op >= OPBASE && op - OPBASE < ntable )
			cycle += inst_cycle[op - OPBASE];
		icnt++;
		if(icnt > MAXINST) runflag = 0;		// guard against runaway programs
	}
	dumpInternal();
	printf("execute %d instructions %d cycles\n",icnt,cycle);
	return 0;
}
