#!/usr/bin/env python
#
# Implements the arithmetic/logic unit of the KCP53000 IXU. Yes, I meant
# KCP53000, not KCP53000B. This ALU is *big* and a bit *sluggish* (mainly
# because of its barrel shifter). But, it *works*.
#
# Fun fact: This is perhaps my 2nd most re-used circuit to come out of the
# Kestrel Computer Project. My 1st being the KIA, of course.
#
# The current logic assumes a 64-bit ALU, even though there is some
# parameterization in the construction of the module. This will be cleaned up
# as I build out devices with other data widths.
from functools import reduce
import inspect
from nmigen.test.tools import FHDLTestCase
from nmigen.back.pysim import Simulator, Delay
from nmigen import (
Signal, Module, ClockDomain, Elaboratable, ResetSignal, Const, Repl, Cat,
)
from nmigen.hdl.ast import Assume, Past, Assert, Stable, Rose, Fell
def create_interface(self, xlen=64):
    """Attach the ALU port signals to ``self`` as attributes.

    ``xlen`` is the width, in bits, of the two operands and of the result.
    Every other port is a single bit wide.  Each signal is given the same
    name as the attribute it is stored under.
    """
    # Operand inputs and the result.  Generally, but not necessarily always,
    # ``a`` carries register 1 and ``b`` carries register 2.
    data_ports = ("a", "b", "out")

    # One-bit ports, listed in creation order.
    bit_ports = (
        # Status flags.
        "cin", "cout", "vout", "zout",
        # Function selects.
        "sums", "ands", "xors", "invb", "lshs", "rshs", "ltus", "ltss",
    )

    for port in data_ports:
        setattr(self, port, Signal(xlen, name=port))
    for port in bit_ports:
        setattr(self, port, Signal(1, name=port))
class ALU(Elaboratable):
    """
    Implements the logic for the ALU.  The original Verilog for the KCP53000
    ALU can be found here:
    https://github.com/sam-falvo/kestrel/blob/master/cores/KCP53K/processor/rtl/verilog/alu.v

    The unit is purely combinational.  Every function computes its result in
    parallel; each result is masked by its own select input and the masked
    results are OR-ed together onto ``out``:

    * ``sums`` -- ``a + b + cin``
    * ``ands`` -- ``a & b``
    * ``xors`` -- ``a ^ b`` (enable ``ands`` and ``xors`` together for OR)
    * ``lshs`` -- ``a`` shifted left by the low bits of ``b``
    * ``rshs`` -- ``a`` shifted right by the low bits of ``b``; ``cin``
      selects sign fill (1) or zero fill (0)
    * ``ltss`` / ``ltus`` -- signed / unsigned less-than, 1 or 0 in bit 0

    ``invb`` inverts the B operand seen by the adder and by the AND/XOR
    logic.  The shifters always take their shift amount from the
    un-inverted ``b`` input.

    Flags: ``cout`` is the carry out of the top bit, ``vout`` is the signed
    overflow indication (carry into the top bit XOR carry out of it), and
    ``zout`` is high when the *masked* sum is zero (so it is also high
    whenever ``sums`` is not selected).
    """

    def __init__(self, formal=False, xlen=64):
        """
        ``formal`` is accepted for interface compatibility; this class does
        not consult it.  ``xlen`` is the data-path width in bits and
        defaults to 64.
        """
        super().__init__()
        create_interface(self, xlen=xlen)

    def elaborate(self, platform):
        """Build and return the ``Module`` describing the ALU logic."""
        m = Module()
        comb = m.d.comb
        xlen = len(self.a)

        a = self.a

        # Conditionally inverted B operand.  The assignment has to be added
        # to the combinatorial domain; a bare ``b.eq(...)`` expression would
        # leave ``b`` undriven.
        b = Signal(xlen)
        comb += b.eq(self.b ^ Repl(self.invb, xlen))

        # This apparently very strange bit of logic is intended to reuse as
        # much as we can from the synthesis tools, while still gaining access
        # to the ultimate and penultimate carry signals, so that we can
        # properly compute the overflow flag setting.  This allows us to
        # implement SLT and SLTU instructions, for example.

        # Add bits 0 to xlen-2 into an xlen-length signal.  The top bit of
        # that signal is the carry feeding bit position xlen-1.
        sum_lo = Signal(xlen)
        c_lo = Signal(1)
        comb += [
            sum_lo.eq(
                a[0:xlen-1] + b[0:xlen-1] + Cat(self.cin, Const(0, xlen-1))
            ),
            c_lo.eq(sum_lo[-1]),
        ]

        # Add the top-most bits of our inputs, accounting for the carry
        # generated above.  Bit 1 of the result is the carry out of the
        # whole adder.
        sum_hi = Signal(2)
        c_hi = Signal(1)
        comb += [
            sum_hi.eq(a[xlen-1] + b[xlen-1] + c_lo),
            c_hi.eq(sum_hi[1]),
        ]

        # The complete, unmasked sum: the low xlen-1 sum bits followed by the
        # top sum bit.  The carry held in sum_lo's top bit must be dropped
        # here, otherwise it would occupy the MSB and the real top bit would
        # be truncated away.
        raw_sum = Signal(xlen)
        comb += raw_sum.eq(Cat(sum_lo[0:xlen-1], sum_hi[0]))

        # The sum as it appears on the output, masked by the sums enable.
        sums = Signal(xlen)
        comb += [
            sums.eq(raw_sum & Repl(self.sums, xlen)),
            self.zout.eq(sums == Const(0, xlen)),
            self.cout.eq(c_hi),
            self.vout.eq(c_hi ^ c_lo),
        ]

        # Boolean logic.  AND and XOR are accounted for.  To see the results
        # of an OR operation, enable *both* self.ands and self.xors together.
        ands = Signal(xlen)
        xors = Signal(xlen)
        comb += [
            ands.eq((a & b) & Repl(self.ands, xlen)),
            xors.eq((a ^ b) & Repl(self.xors, xlen)),
        ]

        # Magnitude comparison logic.  Much of this logic was taken from
        # studying how older CPUs implemented signed vs unsigned less-than
        # comparisons (especially influenced by the 68000, IIRC).
        #
        # Signed less-than is (sign of the sum) XOR (overflow); unsigned
        # less-than is the inverted carry out.  The sign is taken from the
        # *unmasked* sum so that the comparison does not depend on the sums
        # enable.
        # NOTE(review): both results presumably expect the caller to have
        # set up a subtraction (invb=1, cin=1) -- confirm against the
        # instruction decoder.
        ltss = Signal(xlen)
        ltus = Signal(xlen)
        comb += [
            ltss.eq(
                Cat(raw_sum[xlen-1] ^ self.vout, Const(0, xlen-1))
                & Repl(self.ltss, xlen)
            ),
            ltus.eq(
                Cat(~self.cout, Const(0, xlen-1)) & Repl(self.ltus, xlen)
            ),
        ]

        # Now we get to the big, bulky logic that implements the barrel
        # shifter.  If your code typically shifts by less than four bits at a
        # time, and you want a faster processor, you might want to replace
        # this logic with a state machine that shifts one bit at a time.
        # That would probably reduce critical path length in the ALU,
        # allowing the processor to run at a higher clock speed to
        # compensate.
        #
        # Both shifters are a cascade of 2:1 MUX stages.  Stage i either
        # passes the previous stage through unchanged or shifts it by 2**i
        # bits, under control of bit i of the (un-inverted) B input.  For a
        # 64-bit data path this gives six stages shifting by 1, 2, 4, 8, 16
        # and 32 bits.
        stages = (xlen - 1).bit_length()

        # Left shift: vacated low-order bits are filled with zeros.
        lsh_prev = a
        for i in range(stages):
            amount = 1 << i
            lsh_next = Signal(xlen, name=f"lsh{amount}")
            with m.If(self.b[i]):
                comb += lsh_next.eq(
                    Cat(Const(0, amount), lsh_prev[0:xlen-amount])
                )
            with m.Else():
                comb += lsh_next.eq(lsh_prev)
            lsh_prev = lsh_next

        lshs = Signal(xlen)
        comb += lshs.eq(lsh_prev & Repl(self.lshs, xlen))

        # Next up, the infamous right-shift.  This is complicated by the
        # need to worry about sign-extension as we shift: vacated high-order
        # bits are filled with the previous stage's top bit when cin is set,
        # and with zeros otherwise.
        rsh_prev = a
        for i in range(stages):
            amount = 1 << i
            rsh_next = Signal(xlen, name=f"rsh{amount}")
            with m.If(self.b[i]):
                comb += rsh_next.eq(
                    Cat(
                        rsh_prev[amount:xlen],
                        Repl(rsh_prev[xlen-1] & self.cin, amount),
                    )
                )
            with m.Else():
                comb += rsh_next.eq(rsh_prev)
            rsh_prev = rsh_next

        rshs = Signal(xlen)
        comb += rshs.eq(rsh_prev & Repl(self.rshs, xlen))

        # Now we can drive our final output.  Unselected functions contribute
        # all-zeros, so OR-ing the masked results acts as the output MUX.
        comb += self.out.eq(
            sums | ands | xors | ltss | ltus | lshs | rshs
        )

        return m