Kestrel-3: Artifact [4e5c7af7d7]

Artifact 4e5c7af7d706e9be2650ad734b6bce192adfad51de55bd04d555b071cdbaf73e:

File cores/cpu/alu.py — part of check-in [b705e135a7] at 2019-08-05 03:44:15 on branch trunk — SRLI and SRAI -- this completes OP-IMM (user: kc5tja size: 8356)
#!/usr/bin/env python
#
# Implements the arithmetic/logic unit of the KCP53000 IXU.  Yes, I meant
# KCP53000, not KCP53000B.  This ALU is *big* and a bit *sluggish* (mainly
# because of its barrel shifter).  But, it *works*.
#
# Fun fact: This is perhaps my 2nd most re-used circuit to come out of the
# Kestrel Computer Project.  My 1st being the KIA, of course.
#
# The current logic assumes a 64-bit ALU, even though there is some
# parameterization in the construction of the module.  This will be cleaned up
# as I build out devices with other data widths.

from functools import reduce
import inspect

from nmigen.test.tools import FHDLTestCase
from nmigen.back.pysim import Simulator, Delay
from nmigen import (
    Signal, Module, ClockDomain, Elaboratable, ResetSignal, Const, Repl, Cat,
)
from nmigen.hdl.ast import Assume, Past, Assert, Stable, Rose, Fell, Mux


def create_interface(self, xlen=64):
    "Creates the ALU interface."

    # ALU inputs and output.  Generally, but not necessarily always, input A
    # corresponds to register 1, and B to register 2.

    self.a = Signal(xlen)
    self.b = Signal(xlen)
    self.out = Signal(xlen)

    # Status flags

    self.cin = Signal(1)
    self.cout = Signal(1)
    self.vout = Signal(1)
    self.zout = Signal(1)

    # Function Selects

    self.sums = Signal(1)
    self.ands = Signal(1)
    self.xors = Signal(1)
    self.invb = Signal(1)
    self.lshs = Signal(1)
    self.rshs = Signal(1)
    self.ltus = Signal(1)
    self.ltss = Signal(1)


class ALU(Elaboratable):
    """
    Implements the logic for the ALU.  The original Verilog for the KCP53000
    ALU can be found here:

    https://github.com/sam-falvo/kestrel/blob/master/cores/KCP53K/processor/rtl/verilog/alu.v
    """

    def __init__(self, xlen=64, formal=False):
        super().__init__()
        create_interface(self, xlen=xlen)

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        xlen = self.a.nbits

        a = self.a
        b = Signal(self.b.nbits)
        comb += b.eq(self.b ^ Repl(self.invb, xlen))

        # This apparently very strange bit of logic is intended to reuse as
        # much as we can from the synthesis tools, while still gaining access
        # to the ultimate and penultimate carry signals, so that we can
        # properly compute the overflow flag setting.  This allows us to
        # implement SLT and SLTU instructions, for example.

        # Add bits 0 to xlen-1 into an xlen-length signal.  The top bit is the
        # carry that would have fed into that bit position.

        sum_lo = Signal(xlen)
        c_lo = Signal(1)

        comb += [
            sum_lo.eq(a[0:xlen-1] + b[0:xlen-1] + Cat(self.cin, Const(0, xlen-1))),
            c_lo.eq(sum_lo[xlen-1]),
        ]

        # Add the top-most bits of our inputs, accounting for the carry-out
        # generated earlier.

        sum_hi = Signal(2)
        c_hi = Signal(1)

        comb += [
            sum_hi.eq(a[xlen-1] + b[xlen-1] + c_lo),
            c_hi.eq(sum_hi[1]),
        ]

        # The complete sum is the concatenation of sum_hi and sum_lo, masked
        # by the sums enable input.

        sums = Signal(xlen)
        comb += [
            sums.eq(Cat(sum_lo[0:xlen-1], sum_hi[0]) & Repl(self.sums, xlen)),
            self.zout.eq(sums == Const(0, xlen)),
            self.cout.eq(c_hi),
            self.vout.eq(c_hi ^ c_lo),
        ]

        # Boolean logic.  AND and XOR are accounted for.  To see the results
        # of an OR operation, enable *both* self.ands and self.xors together.

        ands = Signal(xlen)
        xors = Signal(xlen)

        comb += [
            ands.eq((a & b) & Repl(self.ands, xlen)),
            xors.eq((a ^ b) & Repl(self.xors, xlen)),
        ]

        # Magnitude comparison logic.  Much of this logic was taken from
        # studying how older CPUs implemented signed vs unsigned less-than
        # comparisons (especially influenced by the 68000, IIRC).
        #
        # Note that it's not enough to just assert ltss or ltus enables.
        # Those enables merely gates the output from the ALU.  You must
        # also configure the ALU inputs as if you were going to perform
        # a subtraction as well!  That means that cin must be 1 and the B
        # input must be inverted.

        ltss = Signal(xlen)
        ltus = Signal(xlen)

        comb += [
            ltss.eq(Cat(sum_hi[0] ^ self.vout, Const(0, xlen-1)) & Repl(self.ltss, xlen)),
            ltus.eq(Cat(~self.cout, Const(0, xlen-1)) & Repl(self.ltus, xlen)),
        ]

        # Now we get to the big, bulky logic that implements the barrel shifter.
        # If your code typically shifts by less than four bits at a time, and you
        # want a faster processor, you might want to replace this logic with a
        # state machine that shifts one bit at a time.  That would probably
        # reduce critical path length in the ALU, allowing the processor to run
        # at a higher clock speed to compensate.

        # Start with the left-shifts.  We take the strategy of shifting using
        # cascading sequences of MUXes.  TODO: Refactor this logic somehow.

        lsh1 = Signal(xlen)
        lsh2 = Signal(xlen)
        lsh4 = Signal(xlen)
        lsh8 = Signal(xlen)
        lsh16 = Signal(xlen)
        lsh32 = Signal(xlen)

        comb += [
            lsh1.eq(Mux(b[0], Cat(Const(0, 1), a[0:xlen-1]), a)),
            lsh2.eq(Mux(b[1], Cat(Const(0, 2), lsh1[0:xlen-2]), lsh1)),
            lsh4.eq(Mux(b[2], Cat(Const(0, 4), lsh2[0:xlen-4]), lsh2)),
            lsh8.eq(Mux(b[3], Cat(Const(0, 8), lsh4[0:xlen-8]), lsh4)),
            lsh16.eq(Mux(b[4], Cat(Const(0, 16), lsh8[0:xlen-16]), lsh8)),
            lsh32.eq(Mux(b[5], Cat(Const(0, 32), lsh16[0:xlen-32]), lsh16)),
        ]

        lshs = Signal(xlen)
        comb += lshs.eq(lsh32 & Repl(self.lshs, xlen))

        # Next up, the infamous right-shift.  This is complicated by the
        # need to worry about sign-extension as we shift.

        rsh1 = Signal(xlen)
        rsh2 = Signal(xlen)
        rsh4 = Signal(xlen)
        rsh8 = Signal(xlen)
        rsh16 = Signal(xlen)
        rsh32 = Signal(xlen)

        c1 = Signal(1)
        c2 = Signal(2)
        c4 = Signal(4)
        c8 = Signal(8)
        c16 = Signal(16)
        c32 = Signal(32)

        comb += [
            c1.eq(a[xlen-1] & self.cin),
            c2.eq(Repl(rsh1[xlen-1] & self.cin, 2)),
            c4.eq(Repl(rsh2[xlen-1] & self.cin, 4)),
            c8.eq(Repl(rsh4[xlen-1] & self.cin, 8)),
            c16.eq(Repl(rsh8[xlen-1] & self.cin, 16)),
            c32.eq(Repl(rsh16[xlen-1] & self.cin, 32)),

            rsh1.eq(Mux(b[0], Cat(a[1:xlen], c1), a)),
            rsh2.eq(Mux(b[1], Cat(rsh1[2:xlen], c2), rsh1)),
            rsh4.eq(Mux(b[2], Cat(rsh2[4:xlen], c4), rsh2)),
            rsh8.eq(Mux(b[3], Cat(rsh4[8:xlen], c8), rsh4)),
            rsh16.eq(Mux(b[4], Cat(rsh8[16:xlen], c16), rsh8)),
            rsh32.eq(Mux(b[5], Cat(rsh16[32:xlen], c32), rsh16)),
        ]

        rshs = Signal(xlen)
        comb += rshs.eq(rsh32 & Repl(self.rshs, xlen))

        # Now we can drive our final output.

        comb += self.out.eq(
            sums | ands | xors | ltss | ltus | lshs | rshs
        )

        return m


class ALUFormal(Elaboratable):
    def __init__(self):
        super().__init__()
        create_interface(self)

    def elaborate(self, platform):
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        dut = ALU()
        m.submodules.dut = dut
        comb += [
            dut.a.eq(self.a),
            dut.b.eq(self.b),
            dut.cin.eq(self.cin),
            self.out.eq(dut.out),
            self.cout.eq(dut.cout),
            self.vout.eq(dut.vout),
            self.zout.eq(dut.zout),
            dut.sums.eq(self.sums),
            dut.ands.eq(self.ands),
            dut.xors.eq(self.xors),
            dut.ltss.eq(self.ltss),
            dut.ltus.eq(self.ltus),
            dut.lshs.eq(self.lshs),
            dut.rshs.eq(self.rshs),
            dut.invb.eq(self.invb),
        ]

        return m


class ALUTestCase(FHDLTestCase):

    def test_alu_formally(self):
        self.assertFormal(ALUFormal(), mode="bmc", depth=100)
        self.assertFormal(ALUFormal(), mode="prove", depth=100)