From cc1c68f246fc7c03d20e9dcef23bce555fd4affa Mon Sep 17 00:00:00 2001 From: viv-eth Date: Sat, 12 Oct 2024 07:52:25 +0200 Subject: [PATCH] [bump] Adjust Snitch cluster for TCASAI kernels --- sw/apps/ata/Makefile | 43 ++++ sw/apps/ata/data/data.h | 83 ++++++ sw/apps/ata/data/datagen.py | 75 ++++++ sw/apps/ata/data/params.hjson | 10 + sw/apps/ata/src/args.h | 19 ++ sw/apps/ata/src/ata.h | 277 +++++++++++++++++++++ sw/apps/ata/src/main.c | 17 ++ sw/blas/axpy/Makefile | 5 +- sw/blas/axpy/data/datagen.py | 12 +- sw/blas/gemm/data/params.hjson | 6 +- sw/snRuntime/src/dma.h | 66 +++++ target/common/common.mk | 4 +- target/snitch_cluster/Makefile | 2 +- target/snitch_cluster/cfg/default.hjson | 33 ++- target/snitch_cluster/sw/apps/Makefile | 1 + target/snitch_cluster/sw/apps/ata/Makefile | 10 + target/snitch_cluster/test/testharness.sv | 21 +- util/sim/data_utils.py | 258 +++++++++++++++++++ 18 files changed, 919 insertions(+), 23 deletions(-) create mode 100644 sw/apps/ata/Makefile create mode 100644 sw/apps/ata/data/data.h create mode 100755 sw/apps/ata/data/datagen.py create mode 100644 sw/apps/ata/data/params.hjson create mode 100644 sw/apps/ata/src/args.h create mode 100644 sw/apps/ata/src/ata.h create mode 100644 sw/apps/ata/src/main.c create mode 100644 target/snitch_cluster/sw/apps/ata/Makefile diff --git a/sw/apps/ata/Makefile b/sw/apps/ata/Makefile new file mode 100644 index 000000000..926d782bb --- /dev/null +++ b/sw/apps/ata/Makefile @@ -0,0 +1,43 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +# Usage of absolute paths is required to externally include this Makefile +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +DATA_DIR := $(realpath $(MK_DIR)/data) +SRC_DIR := $(realpath $(MK_DIR)/src) + +SECTION ?= + +APP ?= ata +SRCS ?= $(SRC_DIR)/main.c +INCDIRS += $(DATA_DIR) $(SRC_DIR) + +DATAGEN_PY = $(DATA_DIR)/datagen.py +DATA_H = $(DATA_DIR)/data.h +DATA_CFG ?= $(DATA_DIR)/params.hjson + +$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) + $< -c $(DATA_CFG) --section="$(SECTION)" $@ + +.PHONY: clean-data clean + +clean-data: + rm -f $(DATA_H) + +clean: clean-data + +debug_app: + @echo "MK_DIR: $(MK_DIR)" + @echo "DATA_DIR: $(DATA_DIR)" + @echo "SRC_DIR: $(SRC_DIR)" + @echo "SECTION: $(SECTION)" + @echo "APP: $(APP)" + @echo "SRCS: $(SRCS)" + @echo "INCDIRS: $(INCDIRS)" + @echo "DATAGEN_PY: $(DATAGEN_PY)" + @echo "DATA_H: $(DATA_H)" + @echo "DATA_CFG: $(DATA_CFG)" + @echo "$(DATAGEN_PY) -c $(DATA_CFG) --section=\"$(SECTION)\" $(DATA_H)" \ No newline at end of file diff --git a/sw/apps/ata/data/data.h b/sw/apps/ata/data/data.h new file mode 100644 index 000000000..c6a182b4f --- /dev/null +++ b/sw/apps/ata/data/data.h @@ -0,0 +1,83 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + + + +double A[64] = { + -0.36, + -0.14, + -0.38, + -1.96, + -1.34, + -1.45, + 0.61, + -0.34, + -0.63, + -1.98, + -1.19, + 0.64, + -1.38, + 0.65, + 0.13, + 0.91, + -1.79, + -1.08, + 0.92, + -1.25, + 0.18, + 0.66, + -0.59, + -1.98, + 0.87, + 0.64, + -1.05, + -0.56, + -0.54, + -0.49, + -1.74, + 0.69, + -0.87, + -0.6, + -0.59, + -0.48, + 0.68, + 0.14, + -1.85, + -1.57, + -0.03, + 0.13, + 0.34, + -1.2, + -0.83, + -1.26, + -0.15, + 0.79, + -0.44, + 0.14, + -1.26, + -0.81, + -0.86, + 0.74, + -0.49, + -0.07, + 0.49, + -0.15, + -0.94, + 0.52, + 0.33, + -0.04, + 0.31, + 0.37, +}; + +double B[256]; + +ata_args_t args = { + .m = 16, + .n = 4, + .a = A, + .b = B, + .m_tiles = 2, + .funcptr = ata_opt +}; \ No newline at end of file diff --git a/sw/apps/ata/data/datagen.py b/sw/apps/ata/data/datagen.py new file mode 100755 index 000000000..02cb76254 --- /dev/null +++ b/sw/apps/ata/data/datagen.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import numpy as np +import os +import sys + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) + +from data_utils import format_array_definition, format_array_declaration, \ + format_struct_definition, DataGen, validate_tcdm_footprint + + +DOUBLE_BUFFER = True + +class AtaDataGen(DataGen): + + # Function pointers to alternative implementations + FUNCPTRS = ["ata_baseline", "ata_opt"] + + def golden_model(self, A): + return np.matmul(A, A.transpose()) + + def validate(self, **kwargs): + assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles" + m_frac = kwargs['m'] // kwargs['m_tiles'] + assert (m_frac % 8) == 0, "m_frac must be an integer multiple of the number of cores" + assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4" + assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" + + # Calculate total TCDM occupation (in bytes, 8 bytes per double) + a_tile_size = m_frac * kwargs['n'] * 8 + b_tile_size = m_frac * m_frac * 8 + total_size = 2 * a_tile_size + b_tile_size + if DOUBLE_BUFFER: + total_size *= 2 + validate_tcdm_footprint(total_size) + + def emit_header(self, **kwargs): + header = [super().emit_header()] + + self.validate(**kwargs) + + A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100 + B = self.golden_model(A) + + A = A.flatten() + B = B.flatten() + + A_uid = 'A' + B_uid = 'B' + + cfg = { + 'm': kwargs['m'], + 'n': kwargs['n'], + 'a': A_uid, + 'b': B_uid, + 'm_tiles': kwargs['m_tiles'], + 'funcptr': kwargs['funcptr'] + } + + header += [format_array_definition('double', A_uid, A)] + header += [format_array_declaration('double', B_uid, B.shape)] + header += [format_struct_definition('ata_args_t', 'args', cfg)] + header = '\n\n'.join(header) + + return header + + +if __name__ == '__main__': + AtaDataGen().main() diff --git a/sw/apps/ata/data/params.hjson
b/sw/apps/ata/data/params.hjson new file mode 100644 index 000000000..1db35db08 --- /dev/null +++ b/sw/apps/ata/data/params.hjson @@ -0,0 +1,10 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + "m": 16, + "n": 4, + "m_tiles": 2, + "funcptr": "ata_opt" +} diff --git a/sw/apps/ata/src/args.h b/sw/apps/ata/src/args.h new file mode 100644 index 000000000..520693e22 --- /dev/null +++ b/sw/apps/ata/src/args.h @@ -0,0 +1,19 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#pragma once +#include <stdint.h> + +typedef void (*ata_fp_t)(uint32_t m, uint32_t n, double *a, double *at, double *b); + +typedef struct { + uint32_t m; + uint32_t n; + double *a; + double *b; + uint32_t m_tiles; + ata_fp_t funcptr; +} ata_args_t; diff --git a/sw/apps/ata/src/ata.h b/sw/apps/ata/src/ata.h new file mode 100644 index 000000000..0e33ea5ff --- /dev/null +++ b/sw/apps/ata/src/ata.h @@ -0,0 +1,277 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "args.h" +#include "snrt.h" + +#define DOUBLE_BUFFER 1 + +__thread int setup_ssr = 1; + +void ata_naive(uint32_t m, uint32_t n, double *a, double *at, double *b) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j++) { + b[i * m + j] = 0; + for (uint32_t k = 0; k < n; k++) { + b[i * m + j] += a[i * n + k] * at[j * n + k]; + } + } + } +} + +void ata_baseline(uint32_t m, uint32_t n, double *a, double *at, double *b) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factor of innermost loop (n must be a multiple of it) + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll = 8; + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j++) { + + double acc = 0; + + for (uint32_t k = 0; k < n; k += unroll) { + asm volatile( + "fmadd.d %[acc], %[a0], %[at0], %[acc] \n" + "fmadd.d %[acc], %[a1], %[at1], %[acc] \n" + "fmadd.d %[acc], %[a2], %[at2], %[acc] \n" + "fmadd.d %[acc], %[a3], %[at3], %[acc] \n" + "fmadd.d %[acc], %[a4], %[at4], %[acc] \n" + "fmadd.d %[acc], %[a5], %[at5], %[acc] \n" + "fmadd.d %[acc], %[a6], %[at6], %[acc] \n" + "fmadd.d %[acc], %[a7], %[at7], %[acc] \n" + : [ acc ] "+f"(acc) + : [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]), + [ a2 ] "f"(a[i * n + k + 2]), [ a3 ] "f"(a[i * n + k + 3]), + [ a4 ] "f"(a[i * n + k + 4]), [ a5 ] "f"(a[i * n + k + 5]), + [ a6 ] "f"(a[i * n + k + 6]), [ a7 ] "f"(a[i * n + k + 7]), + [ at0 ] "f"(at[j * n + k + 0]), [ at1 ] "f"(at[j * n + k + 1]), + [ at2 ] "f"(at[j * n + k + 2]), [ at3 ] "f"(at[j * n + k + 3]), + [ at4 ] "f"(at[j * n + k + 4]), [ at5 ] "f"(at[j * n + k + 5]), + [ at6 ] "f"(at[j * n + k + 6]), [ at7 ] "f"(at[j * n + k + 7]) + : + ); + } + + b[i * m + j] = acc; +
+ } + } +} + +void ata_opt(uint32_t m, uint32_t n, double *a, double *at, double *b) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factor of innermost loop + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll = 4; + + if (setup_ssr) { + // Configure ft0 and ft1 to load A and At + // for (i = offset; i < m; i += stride) + // for (j1 = 0; j1 < m; j1 += unroll) + // for (k = 0; k < n; k++) + // for (j0 = 0; j0 < unroll; j0++) + // j = j1 + j0 + // ft0.push(a[i * n + k]) + // ft1.push(at[j * n + k]) + const uint32_t ssr0_b[4] = {unroll, n, m / unroll, m / stride}; + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)}; + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll); + const uint32_t ssr1_b[4] = {unroll, n, m / unroll, m / stride}; + const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll * n * sizeof(double), 0}; + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], + ssr1_i[3]); + setup_ssr = 0; + } + + // SSR start addresses need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, a + offset * n); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, at); + snrt_ssr_enable(); + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j += unroll) { + + double acc[unroll]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + asm volatile( + "frep.o %[n_frep], %[unroll], 0, 0 \n" + "fmadd.d %[b0], ft0, ft1, %[b0] \n" + "fmadd.d %[b1], ft0, ft1, %[b1] \n" + "fmadd.d %[b2], ft0, ft1, %[b2] \n" + "fmadd.d %[b3], ft0, ft1, %[b3] \n" + : [ b0 ] "+f"(acc[0]), [ b1 ] "+f"(acc[1]), + [ b2 ] "+f"(acc[2]), [ b3 ] "+f"(acc[3]) + : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll) + : "ft0", "ft1", "ft2"); + + b[i * m + j + 0] =
acc[0]; + b[i * m + j + 1] = acc[1]; + b[i * m + j + 2] = acc[2]; + b[i * m + j + 3] = acc[3]; + } + } + + snrt_ssr_disable(); + snrt_fpu_fence(); +} + +void ata_job(ata_args_t *args) { + uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes; + uint64_t local_a0_addr, local_at0_addr, local_b0_addr, + local_a1_addr, local_at1_addr, local_b1_addr; + double *local_a[2]; + double *local_at[2]; + double *local_b[2]; + uint32_t iterations, sb_iterations; + uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx; + +#ifndef JOB_ARGS_PRELOADED + // Allocate space for job arguments in TCDM + ata_args_t *local_args = (ata_args_t *)snrt_l1_next(); + + // Copy job arguments to TCDM + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_args, args, sizeof(ata_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + args = local_args; +#endif + + // Calculate size of each tile + m_frac = args->m / args->m_tiles; + a_tile_size = args->n * m_frac; + b_tile_size = m_frac * m_frac; + a_tile_bytes = a_tile_size * sizeof(double); + b_tile_bytes = b_tile_size * sizeof(double); + + // Allocate space for job operands in TCDM + // Tiles are placed back-to-back after the job arguments: a, at, then b.
+ local_a0_addr = (uint64_t)args + sizeof(ata_args_t); + local_at0_addr = local_a0_addr + a_tile_bytes; + local_b0_addr = local_at0_addr + a_tile_bytes; + local_a[0] = (double *)local_a0_addr; + local_at[0] = (double *)local_at0_addr; + local_b[0] = (double *)local_b0_addr; + if (DOUBLE_BUFFER) { + local_a1_addr = local_b0_addr + b_tile_bytes; + local_at1_addr = local_a1_addr + a_tile_bytes; + local_b1_addr = local_at1_addr + a_tile_bytes; + local_a[1] = (double *)local_a1_addr; + local_at[1] = (double *)local_at1_addr; + local_b[1] = (double *)local_b1_addr; + } + + // Calculate number of iterations + sb_iterations = args->m_tiles * args->m_tiles; + if (DOUBLE_BUFFER) iterations = sb_iterations + 2; + else iterations = sb_iterations; + + // Iterate over all tiles + for (i = 0; i < iterations; i++) { + + if (snrt_is_dm_core()) { + // DMA in + if (!DOUBLE_BUFFER || (i < sb_iterations)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_in = i; + buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0; + i_row = i_dma_in / args->m_tiles; + i_col = i_dma_in % args->m_tiles; + + // Copy job operands in TCDM + snrt_dma_load_1d_tile( + local_a[buff_idx], + args->a, + i_row, + a_tile_size, + sizeof(double)); + snrt_dma_load_1d_tile( + local_at[buff_idx], + args->a, + i_col, + a_tile_size, + sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + + // Additional barriers required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + // DMA out + if (!DOUBLE_BUFFER || (i > 1)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_out = DOUBLE_BUFFER ? i - 2 : i; + buff_idx = DOUBLE_BUFFER ?
i_dma_out % 2 : 0; + i_row = i_dma_out / args->m_tiles; + i_col = i_dma_out % args->m_tiles; + + // Copy job outputs from TCDM + snrt_dma_store_2d_tile( + args->b, + local_b[buff_idx], + i_row, + i_col, + m_frac, + m_frac, + args->m, + sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + } + + // Compute + if (snrt_is_compute_core()) { + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_compute = DOUBLE_BUFFER ? i - 1 : i; + buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0; + + // Perform tile computation + ata_fp_t fp = args->funcptr; + fp(m_frac, args->n, local_a[buff_idx], + local_at[buff_idx], local_b[buff_idx]); + + snrt_mcycle(); + } + + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + } + // Synchronize cores after every iteration + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/apps/ata/src/main.c b/sw/apps/ata/src/main.c new file mode 100644 index 000000000..c8df4bea9 --- /dev/null +++ b/sw/apps/ata/src/main.c @@ -0,0 +1,17 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "snrt.h" + +#include "ata.h" +#include "data.h" + +int main() { + + ata_job(&args); + + return 0; +} diff --git a/sw/blas/axpy/Makefile b/sw/blas/axpy/Makefile index bed4edaa8..df31b086c 100644 --- a/sw/blas/axpy/Makefile +++ b/sw/blas/axpy/Makefile @@ -22,9 +22,12 @@ DATA_H = $(DATA_DIR)/data.h $(DATA_H): $(DATAGEN_PY) $< $(LENGTH) --section="$(SECTION)" > $@ -.PHONY: clean-data clean +.PHONY: clean-data clean debug_app clean-data: rm -f $(DATA_H) clean: clean-data + +debug_app: + @echo "MK_DIR: $(MK_DIR)" diff --git a/sw/blas/axpy/data/datagen.py b/sw/blas/axpy/data/datagen.py index f7ae7a648..e6e732e38 100755 --- a/sw/blas/axpy/data/datagen.py +++ b/sw/blas/axpy/data/datagen.py @@ -12,10 +12,8 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) from data_utils import format_scalar_definition, format_vector_definition, \ - format_vector_declaration, format_ifdef_wrapper # noqa: E402 - -MIN = -1000 -MAX = +1000 + format_vector_declaration, format_ifdef_wrapper, \ + generate_random_array # Aligns data to the size of a beat to avoid misaligned transfers BEAT_ALIGNMENT = 64 @@ -44,9 +42,9 @@ def main(): section = args.section # Randomly generate inputs - a = np.random.uniform(MIN, MAX, 1) - x = np.random.uniform(MIN, MAX, length) - y = np.random.uniform(MIN, MAX, length) + a = generate_random_array(1)[0] + x = generate_random_array(length)[0] + y = generate_random_array(length)[0] z = np.zeros(length) g = golden_model(a, x, y) diff --git a/sw/blas/gemm/data/params.hjson b/sw/blas/gemm/data/params.hjson index 7b3ab59b9..ac174f128 100644 --- a/sw/blas/gemm/data/params.hjson +++ b/sw/blas/gemm/data/params.hjson @@ -5,9 +5,9 @@ // Parameters for a GEMM { - M: 192, - N: 16, - K: 16, + M: 16, + N: 64, + K: 128, beta: 0, ta: false, tb: true, // must be true for SIMD diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index 169e54d7b..bfa3f6436 100644 ---
a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -205,3 +205,69 @@ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) { snrt_dma_start_2d(ptr, ptr, 64, 64, 0, len / 64); snrt_dma_wait_all(); } + +/// Load a 1D-tile of size tile_size from a 1D array. The specific tile is +/// selected by tile_idx. Every element in the src and dst arrays has prec +/// bytes. +inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src, + size_t tile_idx, size_t tile_size, + uint32_t prec) { + size_t tile_nbytes = tile_size * prec; + return snrt_dma_start_1d(dst, src + tile_idx * tile_nbytes, tile_nbytes); +} + +/// Store a 1D-tile of size tile_size to a 1D array. The specific tile is +/// selected by tile_idx. Every element in the src and dst arrays has prec +/// bytes. +inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, + size_t tile_idx, size_t tile_size, + uint32_t prec) { + size_t tile_nbytes = tile_size * prec; + return snrt_dma_start_1d(dst + tile_idx * tile_nbytes, src, tile_nbytes); +} + +/// Load a 2D-tile of shape (tile_x1_size, tile_x0_size) from the 2D array +/// of shape (full_x1_size, full_x0_size). The specific tile is selected +/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and +/// destination arrays has prec bytes. 
+inline snrt_dma_txid_t snrt_dma_load_2d_tile( + void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, + size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, + uint32_t prec) { + size_t src_offset = 0; + // Advance src array in x0 and x1 dimensions, and convert to byte offset + src_offset += tile_x0_idx * tile_x0_size; + src_offset += tile_x1_idx * tile_x1_size * full_x0_size; + src_offset *= prec; + // Initiate transfer + return snrt_dma_start_2d(dst, // dst + src + src_offset, // src + tile_x0_size * prec, // size + tile_x0_size * prec, // dst_stride + full_x0_size * prec, // src_stride + tile_x1_size // repeat + ); +} + +/// Store a 2D-tile of shape (tile_x1_size, tile_x0_size) to the 2D array +/// of shape (full_x1_size, full_x0_size). The specific tile is selected +/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and +/// destination arrays has prec bytes. +inline snrt_dma_txid_t snrt_dma_store_2d_tile( + void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, + size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, + uint32_t prec) { + size_t dst_offset = 0; + // Advance dst array in x0 and x1 dimensions, and convert to byte offset + dst_offset += tile_x0_idx * tile_x0_size; + dst_offset += tile_x1_idx * tile_x1_size * full_x0_size; + dst_offset *= prec; + // Initiate transfer + return snrt_dma_start_2d(dst + dst_offset, // dst + src, // src + tile_x0_size * prec, // size + full_x0_size * prec, // dst_stride + tile_x0_size * prec, // src_stride + tile_x1_size // repeat + ); +} diff --git a/target/common/common.mk b/target/common/common.mk index 48b875f76..1a36fa737 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -151,8 +151,8 @@ $(VSIM_BUILDDIR): mkdir -p $@ define QUESTASIM - ${VSIM} -c -do "source $<; quit" | tee $(dir $<)vsim.log - @! 
grep -P "Errors: [1-9]*," $(dir $<)vsim.log + ${VSIM} -c -do "source work-vsim/compile.vsim.tcl; quit" | tee $(dir work-vsim/compile.vsim.tcl)vsim.log + @! grep -P "Errors: [1-9]*," $(dir work-vsim/compile.vsim.tcl)vsim.log @mkdir -p $(dir $@) @echo "#!/bin/bash" > $@ @echo 'binary=$$(realpath $$1)' >> $@ diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile index 7b38bbad6..094aa48f6 100644 --- a/target/snitch_cluster/Makefile +++ b/target/snitch_cluster/Makefile @@ -251,7 +251,7 @@ ${VSIM_BUILDDIR}/compile.vsim.tcl: echo 'return 0' >> $@ # Build compilation script and compile all sources for Questasim simulation -bin/snitch_cluster.vsim: ${VSIM_BUILDDIR}/compile.vsim.tcl $(VSIM_SOURCES) ${TB_SRCS} ${TB_CC_SOURCES} ${TB_ASM_SOURCES} work/lib/libfesvr.a +bin/snitch_cluster.vsim: $(VSIM_SOURCES) ${TB_SRCS} ${TB_CC_SOURCES} ${TB_ASM_SOURCES} work/lib/libfesvr.a $(call QUESTASIM,tb_bin) ####### diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson index 7f28a1073..1432b81d1 100644 --- a/target/snitch_cluster/cfg/default.hjson +++ b/target/snitch_cluster/cfg/default.hjson @@ -16,21 +16,36 @@ cluster_base_hartid: 0, addr_width: 48, data_width: 64, + user_width: 5, // clog2(total number of clusters) tcdm: { - size: 128, + size: 128, // 128 kiB banks: 32, }, cluster_periph_size: 64, // kB zero_mem_size: 64, // kB dma_data_width: 512, - dma_axi_req_fifo_depth: 3, - dma_req_fifo_depth: 3, + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + dma_user_width: 1, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, // Timing parameters timing: { - lat_comp_fp32: 3, + lat_comp_fp32: 2, lat_comp_fp64: 3, - lat_comp_fp16: 2, - lat_comp_fp16_alt: 2, + lat_comp_fp16: 
1, + lat_comp_fp16_alt: 1, lat_comp_fp8: 1, lat_comp_fp8_alt: 1, lat_noncomp: 1, @@ -43,7 +58,10 @@ register_core_req: true, register_core_rsp: true, register_offload_req: true, - register_offload_rsp: true + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false }, hives: [ // Hive 0 @@ -93,6 +111,7 @@ xf8alt: true, xfdotp: true, xfvec: true, + ssr_nr_credits: 4, num_int_outstanding_loads: 1, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile index d86145c85..955b98726 100644 --- a/target/snitch_cluster/sw/apps/Makefile +++ b/target/snitch_cluster/sw/apps/Makefile @@ -6,6 +6,7 @@ SUBDIRS = lto SUBDIRS += nop +SUBDIRS += ata SUBDIRS += blas/axpy SUBDIRS += blas/gemm SUBDIRS += dnn/batchnorm diff --git a/target/snitch_cluster/sw/apps/ata/Makefile b/target/snitch_cluster/sw/apps/ata/Makefile new file mode 100644 index 000000000..12a123773 --- /dev/null +++ b/target/snitch_cluster/sw/apps/ata/Makefile @@ -0,0 +1,10 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +include ../../../../../sw/apps/ata/Makefile +include ../common.mk + +$(DEP): $(DATA_H) diff --git a/target/snitch_cluster/test/testharness.sv b/target/snitch_cluster/test/testharness.sv index afc6972ed..ad70105dd 100644 --- a/target/snitch_cluster/test/testharness.sv +++ b/target/snitch_cluster/test/testharness.sv @@ -22,13 +22,30 @@ module testharness import snitch_cluster_pkg::*; ( wide_in_resp_t wide_in_resp; logic [snitch_cluster_pkg::NrCores-1:0] msip; - snitch_cluster_wrapper i_snitch_cluster ( + snitch_cluster_pkg:: sram_cfgs_t sram_cfgs; + + // EMAS = 1'b0; + assign sram_cfgs.icache_tag.emas = 1'b0; + assign sram_cfgs.icache_data.emas = 1'b0; + assign sram_cfgs.tcdm.emas = 1'b0; + // EMAW = 2'b01; + assign sram_cfgs.icache_tag.emaw = 2'b01; + assign sram_cfgs.icache_data.emaw = 2'b01; + assign sram_cfgs.tcdm.emaw = 2'b01; + // EMA = 3'b010; + assign sram_cfgs.icache_tag.ema = 3'b010; + assign sram_cfgs.icache_data.ema = 3'b010; + assign sram_cfgs.tcdm.ema = 3'b010; + + occamy_cluster_wrapper_snitch_cluster_wrapper i_snitch_cluster_netlist ( .clk_i, .rst_ni, - .debug_req_i ('0), + // .debug_req_i ('0), .meip_i ('0), .mtip_i ('0), .msip_i (msip), + .sram_cfgs_i ('1), // PL without SDF + // .sram_cfgs_i (sram_cfgs), // PL with SDF .narrow_in_req_i (narrow_in_req), .narrow_in_resp_o (narrow_in_resp), .narrow_out_req_o (narrow_out_req), diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py index 2ed260d3f..d51bb0455 100644 --- a/util/sim/data_utils.py +++ b/util/sim/data_utils.py @@ -6,6 +6,16 @@ import struct from datetime import datetime +import numpy as np +import torch +import argparse +import json5 +import pathlib +import humanize +import pyflexfloat as ff + +# Maximum available size in TCDM (in bytes) +TCDM_HEAP_SIZE = 112 * 1024 def emit_license(): @@ -80,3 +90,251 @@ def bytes_to_uint32s(byte_array): uint = struct.unpack('