Skip to content

Commit 6eddf70

Browse files
committed
bug fixes
1 parent fcd81b2 commit 6eddf70

File tree

7 files changed

+1151
-53
lines changed

7 files changed

+1151
-53
lines changed

chat.rapidgpt

Lines changed: 811 additions & 0 deletions
Large diffs are not rendered by default.

hw/rtl/core/VX_lsu_slice.sv

Lines changed: 33 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
105105
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
106106
wire no_rsp_buf_valid, no_rsp_buf_ready;
107107

108+
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
109+
108110
// fence handling
109111

110112
reg fence_lock;
@@ -208,71 +210,53 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
208210
end
209211
end
210212

211-
// track SOP/EOP for out-of-order memory responses
212-
213-
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
213+
// multi-packet load responses could return out-of-order.
214+
// we should track and return eop packet response last.
214215

215216
if (PID_BITS != 0) begin : g_pids
216217
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
217-
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
218+
reg [`LSUQ_IN_SIZE-1:0] pkt_sop;
219+
reg [`LSUQ_IN_SIZE-1:0] pkt_eop;
218220

219-
wire mem_req_rd_fire = mem_req_fire && ~mem_req_rw;
220-
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if.data.sop;
221+
wire mem_req_rd_fire = mem_req_fire && ~mem_req_rw;
221222
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if.data.eop;
222-
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
223-
wire full;
224-
225-
VX_allocator #(
226-
.SIZE (`LSUQ_IN_SIZE)
227-
) pkt_allocator (
228-
.clk (clk),
229-
.reset (reset),
230-
.acquire_en (mem_req_rd_eop_fire),
231-
.acquire_addr(pkt_waddr),
232-
.release_en (mem_rsp_eop_pkt),
233-
.release_addr(pkt_raddr),
234-
`UNUSED_PIN (empty),
235-
.full (full)
236-
);
237-
238-
wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);
223+
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
224+
225+
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
226+
assign mem_rsp_eop_pkt = mem_rsp_eop && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
239227

240228
always @(posedge clk) begin
241229
if (reset) begin
242-
pkt_ctr <= '0;
243-
pkt_sop <= '0;
244-
pkt_eop <= '0;
245-
end else begin
246-
if (mem_req_rd_sop_fire) begin
247-
pkt_sop[pkt_waddr] <= 1;
230+
for (integer i = 0; i < `LSUQ_IN_SIZE; ++i) begin
231+
pkt_ctr[i] <= '0;
232+
pkt_sop[i] <= 1;
233+
pkt_eop[i] <= 0;
248234
end
235+
end else begin
249236
if (mem_req_rd_eop_fire) begin
250237
pkt_eop[pkt_waddr] <= 1;
251238
end
252-
if (mem_rsp_fire) begin
253-
pkt_sop[pkt_raddr] <= 0;
254-
end
255-
if (mem_rsp_eop_pkt) begin
256-
pkt_eop[pkt_raddr] <= 0;
257-
end
258-
if (~rd_during_wr) begin
239+
if (~(mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr))) begin
259240
if (mem_req_rd_fire) begin
260241
pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
261242
end
262243
if (mem_rsp_eop_fire) begin
263244
pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
264245
end
265246
end
247+
if (mem_rsp_fire) begin
248+
pkt_sop[pkt_raddr] <= 0;
249+
end
250+
if (mem_rsp_eop_fire && mem_rsp_eop_pkt) begin
251+
pkt_sop[pkt_raddr] <= 1;
252+
pkt_eop[pkt_raddr] <= 0;
253+
end
266254
end
267255
end
268-
269-
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
270-
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
271-
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("%t: allocator full!", $time))
272-
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("%t: oops! broken sop request!", $time))
273-
`UNUSED_VAR (mem_rsp_sop)
256+
`RUNTIME_ASSERT(~(mem_req_rd_fire && pkt_eop[pkt_waddr]), ("%t: oops! broken eop request! (#%0d)", $time, execute_if.data.uuid))
257+
`RUNTIME_ASSERT(~(mem_req_rd_fire && (2**PID_BITS-1) == pkt_ctr[pkt_waddr]), ("%t: oops! broken ctr request! (#%0d)", $time, execute_if.data.uuid))
258+
`RUNTIME_ASSERT(~(mem_rsp_fire && 0 == pkt_ctr[pkt_raddr]), ("%t: oops! broken ctr response! (#%0d)", $time, rsp_uuid))
274259
end else begin : g_no_pids
275-
assign pkt_waddr = 0;
276260
assign mem_rsp_sop_pkt = mem_rsp_sop;
277261
assign mem_rsp_eop_pkt = mem_rsp_eop;
278262
`UNUSED_VAR (pkt_raddr)
@@ -337,8 +321,12 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
337321
.core_req_data (mem_req_data),
338322
.core_req_tag (mem_req_tag),
339323
.core_req_ready (mem_req_ready),
340-
`UNUSED_PIN (core_req_empty),
341-
`UNUSED_PIN (core_req_wr_notify),
324+
.core_req_queue_id (pkt_waddr),
325+
326+
// request queue info
327+
`UNUSED_PIN (req_queue_empty),
328+
`UNUSED_PIN (req_queue_pop),
329+
`UNUSED_PIN (req_queue_id),
342330

343331
// Output response
344332
.core_rsp_valid (mem_rsp_valid),

hw/rtl/libs/VX_mem_scheduler.sv

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ module VX_mem_scheduler #(
3939
parameter MEM_BATCH_BITS= `CLOG2(MEM_BATCHES),
4040
parameter MEM_QUEUE_ADDRW= `CLOG2(COALESCE_ENABLE ? MEM_QUEUE_SIZE : CORE_QUEUE_SIZE),
4141
parameter MEM_ADDR_WIDTH= ADDR_WIDTH - `CLOG2(PER_LINE_REQS),
42-
parameter MEM_TAG_WIDTH = UUID_WIDTH + MEM_QUEUE_ADDRW + MEM_BATCH_BITS
42+
parameter MEM_TAG_WIDTH = UUID_WIDTH + MEM_QUEUE_ADDRW + MEM_BATCH_BITS,
43+
parameter CORE_QUEUE_ADDRW = `CLOG2(CORE_QUEUE_SIZE)
4344
) (
4445
input wire clk,
4546
input wire reset,
@@ -54,8 +55,12 @@ module VX_mem_scheduler #(
5455
input wire [CORE_REQS-1:0][WORD_WIDTH-1:0] core_req_data,
5556
input wire [TAG_WIDTH-1:0] core_req_tag,
5657
output wire core_req_ready,
57-
output wire core_req_empty,
58-
output wire core_req_wr_notify,
58+
output wire [CORE_QUEUE_ADDRW-1:0] core_req_queue_id,
59+
60+
// Core request queue
61+
output wire req_queue_empty,
62+
output wire req_queue_pop,
63+
output wire [CORE_QUEUE_ADDRW-1:0] req_queue_id,
5964

6065
// Core response
6166
output wire core_rsp_valid,
@@ -86,7 +91,6 @@ module VX_mem_scheduler #(
8691
);
8792
localparam BATCH_SEL_WIDTH = `UP(MEM_BATCH_BITS);
8893
localparam STALL_TIMEOUT = 10000000;
89-
localparam CORE_QUEUE_ADDRW= `CLOG2(CORE_QUEUE_SIZE);
9094
localparam TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH;
9195
localparam REQQ_TAG_WIDTH = UUID_WIDTH + CORE_QUEUE_ADDRW;
9296
localparam MERGED_TAG_WIDTH= UUID_WIDTH + MEM_QUEUE_ADDRW;
@@ -185,11 +189,13 @@ module VX_mem_scheduler #(
185189
// can accept another request?
186190
assign core_req_ready = reqq_ready_in && ibuf_ready;
187191

188-
// no pending requests
189-
assign core_req_empty = !reqq_valid && ibuf_empty;
192+
// return core queue id
193+
assign core_req_queue_id = ibuf_waddr;
190194

191-
// notify write request submisison
192-
assign core_req_wr_notify = reqq_valid && reqq_ready && reqq_rw;
195+
// request qeueue info
196+
assign req_queue_pop = reqq_valid && reqq_ready;
197+
assign req_queue_empty = !reqq_valid && ibuf_empty;
198+
assign req_queue_id = reqq_tag[CORE_QUEUE_ADDRW-1:0];
193199

194200
// Index buffer ///////////////////////////////////////////////////////////
195201

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
ROOT_DIR := $(realpath ../../..)
2+
include $(ROOT_DIR)/config.mk
3+
4+
PROJECT := sgemm_tpu
5+
6+
SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)
7+
8+
SRCS := $(SRC_DIR)/main.cpp
9+
10+
VX_SRCS := $(SRC_DIR)/kernel.cpp
11+
12+
OPTS ?= -n32
13+
14+
include ../common.mk
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#ifndef _COMMON_H_
2+
#define _COMMON_H_
3+
4+
#ifndef TYPE
5+
#define TYPE float
6+
#endif
7+
8+
typedef struct {
9+
uint32_t grid_dim[2];
10+
uint32_t size;
11+
uint64_t A_addr;
12+
uint64_t B_addr;
13+
uint64_t C_addr;
14+
} kernel_arg_t;
15+
16+
#endif
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#include <vx_spawn.h>
2+
#include "common.h"
3+
4+
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
5+
auto A = reinterpret_cast<TYPE*>(arg->A_addr);
6+
auto B = reinterpret_cast<TYPE*>(arg->B_addr);
7+
auto C = reinterpret_cast<TYPE*>(arg->C_addr);
8+
auto size = arg->size;
9+
10+
int col = blockIdx.x;
11+
int row = blockIdx.y;
12+
13+
TYPE sum(0);
14+
for (int e = 0; e < size; ++e) {
15+
sum += A[row * size + e] * B[e * size + col];
16+
}
17+
18+
C[row * size + col] = sum;
19+
}
20+
21+
int main() {
22+
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
23+
return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg);
24+
}

0 commit comments

Comments
 (0)