NPU reconstruction  ISA  19 Mar 2012


4 PEs, each:  32x32 reg. R, one S (result), ALU
Local data store  4x32, z, t flags
random number generator  32 bits 4x8
main mem:  1Kx32
buf:  interface between LDS and Mem  4x32
PC, IR

instruction format

op:10 a1:10 a2:6 a3:6

ads 10 bits
ls 0,1,2,3
r 0..31

ld ls ads	M[ads] -> buf[ls]    ls = 0,1,2,3
st ads ls	buf[ls] -> M[ads]
lds		buf[.] -> LDS[.]
buf		LDS[.] -> buf[.]
ldr r 		LDS -> R[r]	     r = 0..31
str r           R[r] -> LDS
bc r ls		LDS[ls] -> all R[r]  broadcast

add r3 r1 r2	R[r1] + R[r2] -> R[r3]
sub r3 r1 r2
mul r3 r1 r2
ge r3 r1 r2	R[r1] >= R[r2] -> R[r3]   bool
eq r3 r1 r2 	R[r1] == R[r2] -> R[r3]
inc r		R[r] + 1 -> R[r]
dec r		R[r] - 1 -> R[r]
rnd r		random -> R[r]

jmp ads		ads -> pc
jz ads		if z ads -> pc	     z all LDS == 0
jnz ads		if !z ads -> pc
jt ads r	if R[r] != 0 ads -> pc	 all R true
jf ads r	if R[r] == 0 ads -> pc   all R false

pseudo
sys 1 r		print int all R[r]
sys 2 r		print char all R[r]
sys 3		print nl		
sys 4		stop simulation

npu isa version 2 (25 Mar 2012)

BUF[.] is redundant. ld/st can reach LDS directly. lds,buf are redundant.

ld ls ads 	M[ads] -> LDS[ls]
st ads ls	LDS[ls] -> M[ads]

LDS can be named the "bus interface" (it joins the narrow 32-bit bus with the wide 32x8 bus) or the "broadcast unit" (because it can broadcast to all LDS).  This gives "load wide" and "broadcast r ls":

bc r ls 	LDS[ls] -> all R[r]
ldw ads		M[ads+i] -> all LDS[i], i = 0,1,2,3

**  added  5 Dec 2012   Long Live the King

xor r3 r1 r2	R[r1] ^ R[r2] -> R[r3]
lt r3 r1 r2	R[r1] < R[r2] -> R[r3]

addi r3 r1 #n   R[r1] + n -> R[r3]   (n unsigned 5-bit)
subi r3 r1 #n   R[r1] - n -> R[r3]
ashr r3 r1 #n	R[r1] >> n -> R[r3]  arith shift right

mv_t r3 r1 r2	if R[r3] == true R[r2] -> R[r1]  move if true

ldw ads		M[ads] -> all LDS    broadcast
* new meaning (simplified)

eliminate BUF; loads now go directly to LDS
delete: lds, buf

mov r3 r1  ==  addi r3 r1 #0
inc r3     ==  addi r3 r3 #1
dec r3     ==  subi r3 r3 #1

-----------------------------------

Revised ISA  npu4   16 Dec 2012

aim:  introduce instructions to deal with data structures, so we need indirect addressing.  The first idea is:

ldx r3 r1 r2      R[r3] = M[ R[r1] + R[r2] ]
stx r3 r1 r2      M[ R[r1] + R[r2] ] = R[r3]

However, this will not work because these instructions access memory directly, while the NPU accesses memory through the LDS.  Here is the necessary modification.

The index is stored in the LDS, and memory is addressed indirectly through the LDS.

ldx ls          LDS[ls] = M[LDS[ls]] 
stx ls          M[LDS[ls]] = LDS[ls]

Also refine some instructions to make the set more compact.

ISA NPU version 2

ld ls @ads	LDS[ls] = M[ads]     ls = 0,1,2,3
st ls @ads	M[ads] = LDS[ls]
ldr r 		R[r] = LDS	     r = 0..31
str r           LDS = R[r]
ldw @ads	LDS = M[ads]         load wide
bc r ls		R[r] = LDS[ls]       broadcast
ldx ls r1 r2    LDS[ls] = M[R[ls][r1]+R[ls][r2]]  load index
stx ls r1 r2    M[R[ls][r1]+R[ls][r2]] = LDS[ls]

add r3 r1 r2	R[r3] = R[r1] + R[r2]
sub r3 r1 r2    R[r3] = R[r1] - R[r2]
mul r3 r1 r2    R[r3] = R[r1] * R[r2]
ashr r3 r1 #n   R[r3] = R[r1] >> n
addi r3 r1 #n   R[r3] = R[r1] + n
and r3 r1 r2    R[r3] = R[r1] and R[r2]
or  r3 r1 r2    R[r3] = R[r1] or  R[r2]
xor r3 r1 r2    R[r3] = R[r1] xor R[r2]
lt r3 r1 r2     R[r3] = R[r1] <  R[r2]  
le r3 r1 r2     R[r3] = R[r1] <= R[r2]  
eq r3 r1 r2 	R[r3] = R[r1] == R[r2]

rnd r		R[r] = random 

jmp @ads	pc = ads
jz r @ads	if R[r] == 0, pc = ads 
jnz r @ads	if R[r] != 0, pc = ads

mv_t r3 r1 r2	if R[r3] != 0, R[r1] = R[r2]  move true 

equivalent

clr r           xor r r r 
inc r		addi r r #1
dec r		subi r r #1
mov r3 r1       addi r3 r1 #0

pseudo
	
sys 4		stop simulation

end
