# Kestrel-3: Artifact [231dc2c288]

# Artifact 231dc2c2884b58d673cfefc9f55c489ca335c12a058af9497f92295fd9b23aec:

# File cores/cpu/alu.py — part of check-in [9357ad3b2f] at 2019-08-04 21:41:39 on branch trunk — Introduce (untested) ALU module, clone of KCP53000 ALU (user: kc5tja size: 8218)
#!/usr/bin/env python
#
# Implements the arithmetic/logic unit of the KCP53000 IXU.  Yes, I meant
# KCP53000, not KCP53000B.  This ALU is *big* and a bit *sluggish* (mainly
# because of its barrel shifter).  But, it *works*.
#
# Fun fact: This is perhaps my 2nd most re-used circuit to come out of the
# Kestrel Computer Project.  My 1st being the KIA, of course.
#
# The current logic assumes a 64-bit ALU, even though there is some
# parameterization in the construction of the module.  This will be cleaned up
# as I build out devices with other data widths.

from functools import reduce
import inspect

from nmigen.test.tools import FHDLTestCase
from nmigen.back.pysim import Simulator, Delay
from nmigen import (
    Signal, Module, ClockDomain, Elaboratable, ResetSignal, Const, Repl, Cat,
)
from nmigen.hdl.ast import Assume, Past, Assert, Stable, Rose, Fell


def create_interface(self, xlen=64):
    """
    Attach the ALU's port signals to ``self`` as attributes.

    ``xlen`` is the width, in bits, of the two operands and of the result.
    Every other port is a single bit wide.
    """

    # Operands and result.  Generally, but not necessarily always, input A
    # corresponds to register 1, and B to register 2.
    data_ports = ("a", "b", "out")

    # Status flags.
    flag_ports = ("cin", "cout", "vout", "zout")

    # Function selects.
    select_ports = (
        "sums", "ands", "xors", "invb", "lshs", "rshs", "ltus", "ltss",
    )

    # The signal name is passed explicitly because it can no longer be
    # inferred from an attribute assignment statement.
    for port in data_ports:
        setattr(self, port, Signal(xlen, name=port))

    for port in flag_ports + select_ports:
        setattr(self, port, Signal(1, name=port))


class ALU(Elaboratable):
    """
    Implements the logic for the ALU.  The original Verilog for the KCP53000
    ALU can be found here:

    https://github.com/sam-falvo/kestrel/blob/master/cores/KCP53K/processor/rtl/verilog/alu.v

    The ALU is purely combinational.  Each function (sum, AND, XOR, left
    shift, right shift, unsigned and signed less-than) computes its result
    in parallel; each result is gated by its own select input, and ``out``
    is the bitwise OR of all the gated results.

    Inputs:

    - ``a``, ``b``: operands.  ``b`` is XORed with ``invb`` replicated
      across every bit before it reaches the adder and the AND/XOR logic.
      The shifters take their shift amount from the low bits of the
      *un-inverted* ``b``.
    - ``cin``: added into the sum; it also gates the sign fill of right
      shifts (``cin`` = 1 fills with the sign bit of ``a``, ``cin`` = 0
      fills with zeros).
    - ``sums``, ``ands``, ``xors``, ``lshs``, ``rshs``, ``ltus``, ``ltss``:
      function selects.  Enabling ``ands`` and ``xors`` together yields OR.

    Outputs:

    - ``out``: OR of all selected function results.
    - ``cout``: carry out of the most significant adder bit.
    - ``vout``: XOR of the carries into and out of the most significant
      adder bit.
    - ``zout``: set when the gated sum is zero.
    """

    def __init__(self, formal=False, xlen=64):
        """
        Create the ALU's ports.

        ``formal`` is accepted for interface compatibility; nothing in this
        class reads it.  ``xlen`` is the data width in bits (at least 2).
        """
        super().__init__()
        create_interface(self, xlen=xlen)

    def _barrel_shifter(self, m, prefix, shift_by):
        """
        Build a cascade of 2:1 multiplexer stages that shifts ``self.a``.

        Stage ``i`` is controlled by ``self.b[i]`` and either passes the
        previous stage through unchanged or applies
        ``shift_by(previous, 2**i)``.  Stage signals are named
        ``<prefix><2**i>``.  Returns the output of the last stage.
        """
        xlen = self.a.nbits
        stage = self.a

        # Enough stages to express every shift amount from 0 to xlen-1.
        for bit in range((xlen - 1).bit_length()):
            amount = 1 << bit
            shifted = Signal(xlen, name="{}{}".format(prefix, amount))
            with m.If(self.b[bit]):
                m.d.comb += shifted.eq(shift_by(stage, amount))
            with m.Else():
                m.d.comb += shifted.eq(stage)
            stage = shifted

        return stage

    def elaborate(self, platform):
        """Return a Module containing the ALU's combinational logic."""
        m = Module()
        comb = m.d.comb

        xlen = self.a.nbits

        # Operand B after optional inversion.  The assignment must be added
        # to the comb domain; a bare .eq() expression drives nothing.
        a = self.a
        b = Signal(self.b.nbits)
        comb += b.eq(self.b ^ Repl(self.invb, xlen))

        # This apparently very strange bit of logic is intended to reuse as
        # much as we can from the synthesis tools, while still gaining access
        # to the ultimate and penultimate carry signals, so that we can
        # properly compute the overflow flag setting.  This allows us to
        # implement SLT and SLTU instructions, for example.

        # Add bits 0 to xlen-2 into an xlen-length signal.  The top bit is the
        # carry that would have fed into bit position xlen-1.

        sum_lo = Signal(xlen)
        c_lo = Signal(1)

        comb += [
            sum_lo.eq(a[0:xlen-1] + b[0:xlen-1] + Cat(self.cin, Const(0, xlen-1))),
            c_lo.eq(sum_lo[-1]),
        ]

        # Add the top-most bits of our inputs, accounting for the carry-out
        # generated earlier.

        sum_hi = Signal(2)
        c_hi = Signal(1)

        comb += [
            sum_hi.eq(a[xlen-1] + b[xlen-1] + c_lo),
            c_hi.eq(sum_hi[1]),
        ]

        # The complete sum is the low xlen-1 bits of sum_lo with the low bit
        # of sum_hi on top.  The carry bit held in sum_lo[-1] must be dropped
        # here; otherwise it would land in the most significant result bit.
        # raw_sums is the ungated sum; sums is masked by the sums enable.

        raw_sums = Signal(xlen)
        sums = Signal(xlen)
        comb += [
            raw_sums.eq(Cat(sum_lo[0:xlen-1], sum_hi[0])),
            sums.eq(raw_sums & Repl(self.sums, xlen)),
            # NOTE(review): zout is derived from the gated sum, so it is
            # asserted whenever the sums select is deasserted.  Confirm this
            # against the reference Verilog.
            self.zout.eq(sums == Const(0, xlen)),
            self.cout.eq(c_hi),
            self.vout.eq(c_hi ^ c_lo),
        ]

        # Boolean logic.  AND and XOR are accounted for.  To see the results
        # of an OR operation, enable *both* self.ands and self.xors together.

        ands = Signal(xlen)
        xors = Signal(xlen)

        comb += [
            ands.eq((a & b) & Repl(self.ands, xlen)),
            xors.eq((a ^ b) & Repl(self.xors, xlen)),
        ]

        # Magnitude comparison logic.  Much of this logic was taken from
        # studying how older CPUs implemented signed vs unsigned less-than
        # comparisons (especially influenced by the 68000, IIRC).
        #
        # The signed comparison uses the sign bit of the *ungated* sum so
        # that it works without also enabling the sums output (which would
        # OR the sum itself into the result).

        ltss = Signal(xlen)
        ltus = Signal(xlen)

        comb += [
            ltss.eq(Cat(raw_sums[xlen-1] ^ self.vout, Const(0, xlen-1)) & Repl(self.ltss, xlen)),
            ltus.eq(Cat(~self.cout, Const(0, xlen-1)) & Repl(self.ltus, xlen)),
        ]

        # Now we get to the big, bulky logic that implements the barrel shifter.
        # If your code typically shifts by less than four bits at a time, and you
        # want a faster processor, you might want to replace this logic with a
        # state machine that shifts one bit at a time.  That would probably
        # reduce critical path length in the ALU, allowing the processor to run
        # at a higher clock speed to compensate.

        # Left shift: zeros enter at the least significant end.

        def shift_left(value, amount):
            return Cat(Const(0, amount), value[0:xlen-amount])

        # Right shift: the vacated top bits are filled with the sign bit when
        # cin is set, and with zeros otherwise.

        def shift_right(value, amount):
            return Cat(value[amount:xlen], Repl(value[xlen-1] & self.cin, amount))

        lshs = Signal(xlen)
        comb += lshs.eq(
            self._barrel_shifter(m, "lsh", shift_left) & Repl(self.lshs, xlen)
        )

        rshs = Signal(xlen)
        comb += rshs.eq(
            self._barrel_shifter(m, "rsh", shift_right) & Repl(self.rshs, xlen)
        )

        # Now we can drive our final output.

        comb += self.out.eq(
            sums | ands | xors | ltss | ltus | lshs | rshs
        )

        return m