#!/usr/bin/env python
#
# Implements the arithmetic/logic unit of the KCP53000 IXU. Yes, I meant
# KCP53000, not KCP53000B. This ALU is *big* and a bit *sluggish* (mainly
# because of its barrel shifter). But, it *works*.
#
# Fun fact: This is perhaps my 2nd most re-used circuit to come out of the
# Kestrel Computer Project. My 1st being the KIA, of course.
#
# The current logic assumes a 64-bit ALU, even though there is some
# parameterization in the construction of the module. This will be cleaned up
# as I build out devices with other data widths.
from functools import reduce
import inspect
from nmigen.test.tools import FHDLTestCase
from nmigen.back.pysim import Simulator, Delay
from nmigen import (
Signal, Module, ClockDomain, Elaboratable, ResetSignal, Const, Repl, Cat,
)
from nmigen.hdl.ast import Assume, Past, Assert, Stable, Rose, Fell, Mux
def create_interface(self, xlen=64):
"Creates the ALU interface."
# ALU inputs and output. Generally, but not necessarily always, input A
# corresponds to register 1, and B to register 2.
self.a = Signal(xlen)
self.b = Signal(xlen)
self.out = Signal(xlen)
# Status flags
self.cin = Signal(1)
self.cout = Signal(1)
self.vout = Signal(1)
self.zout = Signal(1)
# Function Selects
self.sums = Signal(1)
self.ands = Signal(1)
self.xors = Signal(1)
self.invb = Signal(1)
self.lshs = Signal(1)
self.rshs = Signal(1)
self.ltus = Signal(1)
self.ltss = Signal(1)
class ALU(Elaboratable):
"""
Implements the logic for the ALU. The original Verilog for the KCP53000
ALU can be found here:
https://github.com/sam-falvo/kestrel/blob/master/cores/KCP53K/processor/rtl/verilog/alu.v
"""
def __init__(self, xlen=64, formal=False):
super().__init__()
create_interface(self, xlen=xlen)
def elaborate(self, platform):
m = Module()
comb = m.d.comb
xlen = self.a.nbits
a = self.a
b = Signal(self.b.nbits)
comb += b.eq(self.b ^ Repl(self.invb, xlen))
# This apparently very strange bit of logic is intended to reuse as
# much as we can from the synthesis tools, while still gaining access
# to the ultimate and penultimate carry signals, so that we can
# properly compute the overflow flag setting. This allows us to
# implement SLT and SLTU instructions, for example.
# Add bits 0 to xlen-1 into an xlen-length signal. The top bit is the
# carry that would have fed into that bit position.
sum_lo = Signal(xlen)
c_lo = Signal(1)
comb += [
sum_lo.eq(a[0:xlen-1] + b[0:xlen-1] + Cat(self.cin, Const(0, xlen-1))),
c_lo.eq(sum_lo[xlen-1]),
]
# Add the top-most bits of our inputs, accounting for the carry-out
# generated earlier.
sum_hi = Signal(2)
c_hi = Signal(1)
comb += [
sum_hi.eq(a[xlen-1] + b[xlen-1] + c_lo),
c_hi.eq(sum_hi[1]),
]
# The complete sum is the concatenation of sum_hi and sum_lo, masked
# by the sums enable input.
sums = Signal(xlen)
comb += [
sums.eq(Cat(sum_lo[0:xlen-1], sum_hi[0]) & Repl(self.sums, xlen)),
self.zout.eq(sums == Const(0, xlen)),
self.cout.eq(c_hi),
self.vout.eq(c_hi ^ c_lo),
]
# Boolean logic. AND and XOR are accounted for. To see the results
# of an OR operation, enable *both* self.ands and self.xors together.
ands = Signal(xlen)
xors = Signal(xlen)
comb += [
ands.eq((a & b) & Repl(self.ands, xlen)),
xors.eq((a ^ b) & Repl(self.xors, xlen)),
]
# Magnitude comparison logic. Much of this logic was taken from
# studying how older CPUs implemented signed vs unsigned less-than
# comparisons (especially influenced by the 68000, IIRC).
#
# Note that it's not enough to just assert ltss or ltus enables.
# Those enables merely gates the output from the ALU. You must
# also configure the ALU inputs as if you were going to perform
# a subtraction as well! That means that cin must be 1 and the B
# input must be inverted.
ltss = Signal(xlen)
ltus = Signal(xlen)
comb += [
ltss.eq(Cat(sum_hi[0] ^ self.vout, Const(0, xlen-1)) & Repl(self.ltss, xlen)),
ltus.eq(Cat(~self.cout, Const(0, xlen-1)) & Repl(self.ltus, xlen)),
]
# Now we get to the big, bulky logic that implements the barrel shifter.
# If your code typically shifts by less than four bits at a time, and you
# want a faster processor, you might want to replace this logic with a
# state machine that shifts one bit at a time. That would probably
# reduce critical path length in the ALU, allowing the processor to run
# at a higher clock speed to compensate.
# Start with the left-shifts. We take the strategy of shifting using
# cascading sequences of MUXes. TODO: Refactor this logic somehow.
lsh1 = Signal(xlen)
lsh2 = Signal(xlen)
lsh4 = Signal(xlen)
lsh8 = Signal(xlen)
lsh16 = Signal(xlen)
lsh32 = Signal(xlen)
comb += [
lsh1.eq(Mux(b[0], Cat(Const(0, 1), a[0:xlen-1]), a)),
lsh2.eq(Mux(b[1], Cat(Const(0, 2), lsh1[0:xlen-2]), lsh1)),
lsh4.eq(Mux(b[2], Cat(Const(0, 4), lsh2[0:xlen-4]), lsh2)),
lsh8.eq(Mux(b[3], Cat(Const(0, 8), lsh4[0:xlen-8]), lsh4)),
lsh16.eq(Mux(b[4], Cat(Const(0, 16), lsh8[0:xlen-16]), lsh8)),
lsh32.eq(Mux(b[5], Cat(Const(0, 32), lsh16[0:xlen-32]), lsh16)),
]
lshs = Signal(xlen)
comb += lshs.eq(lsh32 & Repl(self.lshs, xlen))
# Next up, the infamous right-shift. This is complicated by the
# need to worry about sign-extension as we shift.
rsh1 = Signal(xlen)
rsh2 = Signal(xlen)
rsh4 = Signal(xlen)
rsh8 = Signal(xlen)
rsh16 = Signal(xlen)
rsh32 = Signal(xlen)
c1 = Signal(1)
c2 = Signal(2)
c4 = Signal(4)
c8 = Signal(8)
c16 = Signal(16)
c32 = Signal(32)
comb += [
c1.eq(a[xlen-1] & self.cin),
c2.eq(Repl(rsh1[xlen-1] & self.cin, 2)),
c4.eq(Repl(rsh2[xlen-1] & self.cin, 4)),
c8.eq(Repl(rsh4[xlen-1] & self.cin, 8)),
c16.eq(Repl(rsh8[xlen-1] & self.cin, 16)),
c32.eq(Repl(rsh16[xlen-1] & self.cin, 32)),
rsh1.eq(Mux(b[0], Cat(a[1:xlen], c1), a)),
rsh2.eq(Mux(b[1], Cat(rsh1[2:xlen], c2), rsh1)),
rsh4.eq(Mux(b[2], Cat(rsh2[4:xlen], c4), rsh2)),
rsh8.eq(Mux(b[3], Cat(rsh4[8:xlen], c8), rsh4)),
rsh16.eq(Mux(b[4], Cat(rsh8[16:xlen], c16), rsh8)),
rsh32.eq(Mux(b[5], Cat(rsh16[32:xlen], c32), rsh16)),
]
rshs = Signal(xlen)
comb += rshs.eq(rsh32 & Repl(self.rshs, xlen))
# Now we can drive our final output.
comb += self.out.eq(
sums | ands | xors | ltss | ltus | lshs | rshs
)
return m
class ALUFormal(Elaboratable):
def __init__(self):
super().__init__()
create_interface(self)
def elaborate(self, platform):
m = Module()
sync = m.d.sync
comb = m.d.comb
dut = ALU()
m.submodules.dut = dut
comb += [
dut.a.eq(self.a),
dut.b.eq(self.b),
dut.cin.eq(self.cin),
self.out.eq(dut.out),
self.cout.eq(dut.cout),
self.vout.eq(dut.vout),
self.zout.eq(dut.zout),
dut.sums.eq(self.sums),
dut.ands.eq(self.ands),
dut.xors.eq(self.xors),
dut.ltss.eq(self.ltss),
dut.ltus.eq(self.ltus),
dut.lshs.eq(self.lshs),
dut.rshs.eq(self.rshs),
dut.invb.eq(self.invb),
]
return m
class ALUTestCase(FHDLTestCase):
def test_alu_formally(self):
self.assertFormal(ALUFormal(), mode="bmc", depth=100)
self.assertFormal(ALUFormal(), mode="prove", depth=100)