forked from peterpengwei/bwa-mem-sw
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbatch_manager.v
927 lines (875 loc) · 46.6 KB
/
batch_manager.v
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
// ***************************************************************************
//
// Batch Manager
//
// Engineer: Peng Wei
// Create Date: Feb 12, 2015
// Module Name: batch_manager
// Description: manage each PE array's task batch buffer and result
// batch buffer
// it instantiates TBBs and RBBs
// ***************************************************************************
//
// CSR Address Map -- Change v1.1
//------------------------------------------------------------------------------------------
// Address[15:0] Attribute Name Comments
// 'h1A00 WO CSR_AFU_DSM_BASEL Lower 32-bits of AFU DSM base address. The lower 6 bits are zero since the address is cache-line aligned.
// 'h1A04 WO CSR_AFU_DSM_BASEH Upper 32-bits of AFU DSM base address.
// 'h1A20: WO CSR_SRC_ADDR Start physical address for source buffer. All read requests are targeted to this region.
// 'h1A24: WO CSR_DST_ADDR Start physical address for destination buffer. All write requests are targeted to this region.
// 'h1A28: WO CSR_NUM_BATCHES Number of task batches available for processing. In this module each write to this CSR while in execution mode increments the count by one; the written data is ignored.
// 'h1A2c: WO CSR_CTL Controls test flow, start, stop, force completion
// 'h1A34: WO CSR_CFG Configures test parameters
// 'h1A38: WO CSR_INACT_THRESH inactivity threshold limit
// 'h1A3c WO CSR_INTERRUPT0 SW allocates Interrupt APIC ID & Vector to device
//
//
// DSM Offset Map -- Change v1.1
//------------------------------------------------------------------------------------------
// Byte Offset Attribute Name Comments
// 0x00 RO DSM_AFU_ID non-zero value to uniquely identify the AFU
// 0x40 RO DSM_STATUS test status and error register
//
//
// 1 Cacheline = 64B i.e 2^6 Bytes
// Let 2^N be the number of cachelines in the source & destination buffers. Then select CSR_SRC_ADDR & CSR_DST_ADDR to be 2^(N+6)-byte aligned.
// CSR_NUM_BATCHES should be less than or equal to 2^N.
//
// CSR_SRC_ADDR:
// [31:N] WO 2^(N+6)-byte aligned address points to the start of read buffer
// [N-1:0] WO 'h0
//
// CSR_DST_ADDR:
// [31:N] WO 2^(N+6)-byte aligned address points to the start of write buffer
// [N-1:0] WO 'h0
//
// CSR_NUM_BATCHES:
// [31:N] WO 'h0
// [N-1:0] WO # cache lines to be read/written to. This threshold may be different for each test AFU. IMPORTANT- Ensure that source and destination buffers
// are large enough to accommodate the N cache lines.
//
// Let's assume N=14, then CSR_SRC_ADDR and CSR_DST_ADDR will accept a 2^20, i.e. 1MB aligned addresses.
//
// CSR_SRC_ADDR:
// [31:14] WO 1MB aligned address
// [13:0] WO 'h0
//
// CSR_DST_ADDR:
// [31:14] WO 1MB aligned address
// [13:0] WO 'h0
//
// CSR_NUM_BATCHES:
// [31:14] WO 'h0
// [13:0] WO # cache lines to be read/written to. This threshold may be different for each test AFU. IMPORTANT- Ensure that source and destination buffers
// are large enough to accommodate the # cache lines.
//
// CSR_CTL:
// [31:3] WO Rsvd
// [2] WO Force test completion. Writes test completion flag and other performance counters to csr_stat. It appears to be like a normal test completion.
// [1] WO Starts test execution.
// [0] WO Active low test Reset. All configuration parameters change to reset defaults.
//
//
// CSR_CFG:
// [29] WO cr_interrupt_testmode - used to test interrupt. Generates an interrupt at end of each test.
// [28] WO cr_interrupt_on_error - send an interrupt when error detected
// [27:20] WO cr_test_cfg -may be used to configure the behavior of each test mode
// [10:9] WO cr_rdsel -configure read request type. 0- RdLine_S, 1- RdLine_I, 2- RdLine_O, 3- Mixed mode
// [8] WO cr_delay_en -enable random delay insertion between requests
// [4:2] WO cr_mode -configures test mode
// [1] WO cr_cont - 1- test rollsover to start address after it reaches the CSR_NUM_BATCHES count. Such a test terminates only on an error.
// 0- test terminates, updated the status csr when CSR_NUM_BATCHES count is reached.
// [0] WO cr_wrthru_en -switch between write back to write through request type. 0- Wr Back, 1- WrThru
//
//
// CSR_INACT_THRESHOLD:
// [31:0] WO inactivity threshold limit. The idea is to detect longer duration of stalls during a test run. Inactivity counter will count number of consecutive idle cycles,
// i.e. no requests are sent and no responses are received. If the inactivity count > CSR_INACT_THRESHOLD then it sets the inact_timeout signal. The inactivity counter
// is activated only after test is started by writing 1 to CSR_CTL[1].
//
// CSR_INTERRUPT0:
// [23:16] WO vector - Interrupt Vector # for the device
// [15:0] WO apic id - Interrupt APIC ID for the device
//
// DSM_STATUS:
// [511:256] RO Error dump from Test Mode
// [255:224] RO end overhead
// [223:192] RO start overhead
// [191:160] RO Number of writes
// [159:128] RO Number of reads
// [127:64] RO Number of clocks
// [63:32] RO test error register
// [31:0] RO test completion flag
//
// DSM_AFU_ID:
// [511:144] RO Zeros
// [143:128] RO Version
// [127:0] RO AFU ID
// batch_manager: module header, parameters and port declarations.
//
// Parameters:
//   TBB_WR_ADDR_WIDTH / TBB_WR_DATA_WIDTH - address/data widths of the task batch buffer (TBB) write side
//   TBB_RD_ADDR_WIDTH / TBB_RD_DATA_WIDTH - address/data widths of the TBB read side exposed to the PE arrays
//   RBB_ADDR_WIDTH / RBB_DATA_WIDTH       - address/data widths of the result batch buffer (RBB)
//   NUM_PEA                               - number of lanes; every *_b port packs NUM_PEA lanes side by side
//                                           (POINTER_WIDTH below only supports NUM_PEA <= 4)
//   TXHDR_WIDTH / RXHDR_WIDTH / DATA_WIDTH - CCI Tx header, Rx header and data widths
module batch_manager #(parameter TBB_WR_ADDR_WIDTH=12,
TBB_WR_DATA_WIDTH=512,
TBB_RD_ADDR_WIDTH=16,
TBB_RD_DATA_WIDTH=32,
RBB_ADDR_WIDTH=8,
RBB_DATA_WIDTH=512,
NUM_PEA=4,
TXHDR_WIDTH=61,
RXHDR_WIDTH=61,
DATA_WIDTH=512)
(
// ---------------------------global signals-------------------------------------------------
clk, // in std_logic; -- Core clock
reset_n, // in std_logic; -- Use SPARINGLY only for control
// ---------------------------IF signals between SPL and FPL --------------------------------
rb2cf_C0RxHdr, // [RXHDR_WIDTH-1:0] cci_intf: Rx header to SPL channel 0
rb2cf_C0RxData, // [DATA_WIDTH -1:0] cci_intf: Rx data response to SPL | no back pressure
rb2cf_C0RxWrValid, // cci_intf: Rx write response enable
rb2cf_C0RxRdValid, // cci_intf: Rx read response enable
rb2cf_C0RxCfgValid, // cci_intf: Rx config response enable
//rb2cf_C0RxUMsgValid, // cci_intf: Rx UMsg valid
//rb2cf_C0RxIntrValid, // cci_intf: Rx interrupt valid
rb2cf_C1RxHdr, // [RXHDR_WIDTH-1:0] cci_intf: Rx header to SPL channel 1
rb2cf_C1RxWrValid, // cci_intf: Rx write response valid
//rb2cf_C1RxIntrValid, // cci_intf: Rx interrupt valid
cf2ci_C0TxHdr, // [TXHDR_WIDTH-1:0] cci_intf: Tx Header from SPL channel 0
cf2ci_C0TxRdValid, // cci_intf: Tx read request enable
cf2ci_C1TxHdr, // cci_intf: Tx Header from SPL channel 1
cf2ci_C1TxData, // cci_intf: Tx data from SPL
cf2ci_C1TxWrValid, // cci_intf: Tx write request enable
//cf2ci_C1TxIntrValid, // cci_intf: Tx interrupt valid
ci2cf_C0TxAlmFull, // cci_intf: Tx memory channel 0 almost full
ci2cf_C1TxAlmFull, // cci_intf: TX memory channel 1 almost full
ci2cf_InitDn, // Link initialization is complete
// ---------------------------IF signals to the PE arrays (one lane per PE array) -----------
bm2pe_start_b, // Start the task
pe2bm_done_b, // Task done
pe2bm_rbbWrEn_b, // RBB write enable, forwarded to rbbWrEn_b
pe2bm_rbbWrAddr_b, // RBB write address, forwarded to rbbWrAddr_b
pe2bm_rbbWrDin_b, // RBB write data, forwarded to rbbWrDin_b
pe2bm_tbbRdAddr_b, // TBB read address, forwarded to tbbRdAddr_b
bm2pe_tbbRdDout_b // TBB read data, driven from tbbRdDout_b
);
input clk; // in std_logic; -- Core clock
input reset_n; // in std_logic; -- Use SPARINGLY only for control
input [RXHDR_WIDTH-1:0] rb2cf_C0RxHdr; // [RXHDR_WIDTH-1:0]cci_intf: Rx header to SPL channel 0
input [DATA_WIDTH -1:0] rb2cf_C0RxData; // [DATA_WIDTH -1:0]cci_intf: data response to SPL | no back pressure
input rb2cf_C0RxWrValid; // cci_intf: write response enable
input rb2cf_C0RxRdValid; // cci_intf: read response enable
input rb2cf_C0RxCfgValid; // cci_intf: config response enable
//input rb2cf_C0RxUMsgValid; // cci_intf: Rx UMsg valid
//input rb2cf_C0RxIntrValid; // cci_intf: interrupt response enable
input [RXHDR_WIDTH-1:0] rb2cf_C1RxHdr; // [RXHDR_WIDTH-1:0]cci_intf: Rx header to SPL channel 1
input rb2cf_C1RxWrValid; // cci_intf: write response valid
//input rb2cf_C1RxIntrValid; // cci_intf: interrupt response valid
output [TXHDR_WIDTH-1:0] cf2ci_C0TxHdr; // [TXHDR_WIDTH-1:0]cci_intf: Tx Header from SPL channel 0
output cf2ci_C0TxRdValid; // cci_intf: Tx read request enable
output [TXHDR_WIDTH-1:0] cf2ci_C1TxHdr; // cci_intf: Tx Header from SPL channel 1
output [DATA_WIDTH -1:0] cf2ci_C1TxData; // cci_intf: Tx data from SPL
output cf2ci_C1TxWrValid; // cci_intf: Tx write request enable
//output cf2ci_C1TxIntrValid; // cci_intf: Tx interrupt valid
input ci2cf_C0TxAlmFull; // cci_intf: Tx memory channel 0 almost full
input ci2cf_C1TxAlmFull; // cci_intf: TX memory channel 1 almost full
input ci2cf_InitDn; // cci_intf: Link initialization is complete
// PE-array side: bit/slice i of each vector belongs to lane i
output [NUM_PEA-1:0] bm2pe_start_b;
input [NUM_PEA-1:0] pe2bm_done_b;
input [NUM_PEA-1:0] pe2bm_rbbWrEn_b;
input [RBB_ADDR_WIDTH*NUM_PEA-1:0] pe2bm_rbbWrAddr_b;
input [RBB_DATA_WIDTH*NUM_PEA-1:0] pe2bm_rbbWrDin_b;
input [TBB_RD_ADDR_WIDTH*NUM_PEA-1:0] pe2bm_tbbRdAddr_b;
output [TBB_RD_DATA_WIDTH*NUM_PEA-1:0] bm2pe_tbbRdDout_b;
//----------------------------------------------------------------------------------------------------------------------
// AFU ID and version; both are written to the DSM_AFU_ID location by the first Tx write request
localparam BWA_MEM_SW = 128'h2015_0212_900d_beef_0000_0000_0000_0000;
localparam VERSION = 16'h0001;
//---------------------------------------------------------
// CCI-S Request Encodings ***** DO NOT MODIFY ******
//---------------------------------------------------------
localparam WrThru = 4'h1;
localparam WrLine = 4'h2;
localparam RdLine = 4'h4;
localparam WrFence = 4'h5;
//--------------------------------------------------------
// CCI-S Response Encodings ***** DO NOT MODIFY ******
//--------------------------------------------------------
localparam RSP_CSR = 4'h0;
localparam RSP_WRITE = 4'h1;
localparam RSP_READ = 4'h4;
//---------------------------------------------------------
// Default Values ****** May be MODIFIED *******
// These are the reset values of cr_src_address, cr_dst_address and cr_dsm_base.
//---------------------------------------------------------
localparam DEF_SRC_ADDR = 32'h0400_0000; // Read data starting from here. Cache aligned Address
localparam DEF_DST_ADDR = 32'h0500_0000; // Copy data to here. Cache aligned Address
localparam DEF_DSM_BASE = 32'h04ff_ffff; // default status address (32-bit value; zero-extended into the 64-bit cr_dsm_base on reset)
//---------------------------------------------------------
// CSR Address Map ***** DO NOT MODIFY *****
// Compared against {rb2cf_C0RxHdr[13:0], 2'b00} in the CSR decode.
//---------------------------------------------------------
localparam CSR_AFU_DSM_BASEL = 16'h1a00; // WO - Lower 32-bits of AFU DSM base address. The lower 6 bits are zero since the address is cache-line aligned.
localparam CSR_AFU_DSM_BASEH = 16'h1a04; // WO - Upper 32-bits of AFU DSM base address.
localparam CSR_SRC_ADDR = 16'h1a20; // WO Reads are targeted to this region
localparam CSR_DST_ADDR = 16'h1a24; // WO Writes are targeted to this region
localparam CSR_NUM_BATCHES = 16'h1a28; // WO Numbers of task batches to be read/write
localparam CSR_CTL = 16'h1a2c; // WO Control CSR to start n stop the test
//----------------------------------------------------------------------------------
// Device Status Memory (DSM) Address Map ***** DO NOT MODIFY *****
// Physical address = value at CSR_AFU_DSM_BASE + Byte offset
//----------------------------------------------------------------------------------
// Byte Offset Attribute Width Comments
localparam DSM_AFU_ID = 32'h0; // RO 32b non-zero value to uniquely identify the AFU
localparam DSM_STATUS = 32'h40; // RO 512b test status and error info
// Width of the round-robin lane pointers: 1 bit for NUM_PEA<=2, 2 bits for NUM_PEA<=4.
// Evaluates to -1 for NUM_PEA>4, i.e. more than four lanes is not a supported configuration.
localparam POINTER_WIDTH = NUM_PEA<=2 ? 1 : (NUM_PEA<=4 ? 2 : -1);
// Bits of a 14-bit request ID left above a TBB line index.
// NOTE(review): not referenced in the portion of the file visible here - confirm it is used further down.
localparam RD_REQ_POINTER_WIDTH = 14 - TBB_WR_ADDR_WIDTH;
//----------------------------------------------------------------------------------------------------------------------
// ---- Task batch buffer (TBB) side: one bit / slice per lane ----
reg [NUM_PEA-1:0] tbbWrEn_b;
reg [TBB_WR_ADDR_WIDTH*NUM_PEA-1:0] tbbWrAddr_b;
reg [TBB_WR_DATA_WIDTH*NUM_PEA-1:0] tbbWrDin_b;
wire [NUM_PEA-1:0] tbbFull_b;
wire [TBB_RD_ADDR_WIDTH*NUM_PEA-1:0] tbbRdAddr_b;
wire [TBB_RD_DATA_WIDTH*NUM_PEA-1:0] tbbRdDout_b;
wire [NUM_PEA-1:0] tbbEmpty_b;
wire [NUM_PEA-1:0] tbbReqValid_b; // lane has a read request pending
wire [NUM_PEA*TBB_WR_ADDR_WIDTH-1:0] tbbReqLineIdx_b; // line index of the pending read request, placed in the low bits of RdReqId
reg [NUM_PEA-1:0] tbbReqAck_b; // pulsed by the arbiter when the lane's request is issued
// The PE arrays read the TBBs directly
assign tbbRdAddr_b = pe2bm_tbbRdAddr_b;
assign bm2pe_tbbRdDout_b = tbbRdDout_b;
// ---- Result batch buffer (RBB) side: one bit / slice per lane ----
wire [NUM_PEA-1:0] rbbWrEn_b;
wire [RBB_ADDR_WIDTH*NUM_PEA-1:0] rbbWrAddr_b;
wire [RBB_DATA_WIDTH*NUM_PEA-1:0] rbbWrDin_b;
wire [NUM_PEA-1:0] rbbFull_b;
wire [RBB_ADDR_WIDTH*NUM_PEA-1:0] rbbRdAddr_b;
wire [RBB_DATA_WIDTH*NUM_PEA-1:0] rbbRdDout_b;
wire [NUM_PEA-1:0] rbbEmpty_b;
wire [NUM_PEA-1:0] rbbReqValid_b; // lane has a write request pending
wire [NUM_PEA*RBB_ADDR_WIDTH-1:0] rbbReqLineIdx_b; // line index of the pending write request, placed in the low bits of WrReqId
reg [NUM_PEA-1:0] rbbReqAck_b; // pulsed by the arbiter when the lane's request is issued
// The PE arrays write the RBBs directly
assign rbbWrEn_b = pe2bm_rbbWrEn_b;
assign rbbWrAddr_b = pe2bm_rbbWrAddr_b;
assign rbbWrDin_b = pe2bm_rbbWrDin_b;
// ---- Registered CCI Tx outputs ----
reg [DATA_WIDTH-1:0] cf2ci_C1TxData;
reg [TXHDR_WIDTH-1:0] cf2ci_C1TxHdr;
reg cf2ci_C1TxWrValid;
reg [TXHDR_WIDTH-1:0] cf2ci_C0TxHdr;
reg cf2ci_C0TxRdValid;
reg dsm_base_valid; // set once CSR_AFU_DSM_BASEL has been written
reg afuid_updtd; // set once the AFU ID write to the DSM has been issued
// ---- CSR shadow registers ----
reg [63:0] cr_dsm_base; // a00h, a04h - DSM base address
reg [31:0] cr_src_address; // a20h - source buffer address
reg [31:0] cr_dst_address; // a24h - destn buffer address
reg [31:0] cr_num_batches; // a28h - Number of batches available for processing
reg [31:0] cr_ctl = 0; // a2ch - control register to start and stop the test
wire test_go = cr_ctl[1]; // When 0, it allows reconfiguration of test parameters.
//register for storing number of task batches that get received
reg [31:0] NumBatchesRecv;
reg [31:0] NumBatchesRecv_d;
//pointer to TBB
reg [POINTER_WIDTH-1:0] tbb_pointer;
reg [POINTER_WIDTH-1:0] tbb_pointer_d;
//pointer to RBB
reg [POINTER_WIDTH-1:0] rbb_pointer;
reg [POINTER_WIDTH-1:0] rbb_pointer_d;
//CCI Read Address Offset
reg [TBB_WR_ADDR_WIDTH-1:0] RdAddrOffset;
reg [TBB_WR_ADDR_WIDTH-1:0] RdAddrOffset_d;
//CCI Read ID
reg [13:0] RdReqId;
//CCI Read Type
wire [3:0] rdreq_type = RdLine;
//CCI Write Address Offset
reg [RBB_ADDR_WIDTH-1:0] WrAddrOffset;
reg [RBB_ADDR_WIDTH-1:0] WrAddrOffset_d;
//CCI Write ID
reg [13:0] WrReqId;
//CCI Write Type
wire [3:0] wrreq_type = WrLine;
// dsm_offset2addr is not defined in the visible part of the file; presumably it combines the DSM base with a byte offset - confirm.
wire [31:0] ds_afuid_address = dsm_offset2addr(DSM_AFU_ID,cr_dsm_base); // 0h - afu id is written to this address
wire [31:0] ds_stat_address = dsm_offset2addr(DSM_STATUS,cr_dsm_base); // 40h - test status is written to this address
wire re2xy_go = test_go & afuid_updtd & ci2cf_InitDn; // After initializing DSM, we can do actual tasks on AFU
// Combinational strobes produced by the round-robin arbiter
reg WrHdr_valid; // 1: Valid Write Request
reg RdHdr_valid; // 1: Valid Read Request
//-------------------------
//CSR Register Handling
//-------------------------
// CSR write decode.
// A config write is identified by rb2cf_C0RxCfgValid; the CSR byte address is
// {rb2cf_C0RxHdr[13:0], 2'b00} and the payload is rb2cf_C0RxData[31:0].
//   - CSR_CTL is always writable.
//   - Source/destination/DSM base are writable only while test_go is low (configuration mode).
//   - CSR_NUM_BATCHES is honoured only while re2xy_go is high (execution mode); every
//     write bumps cr_num_batches by one, the written data is not used.
always @(posedge clk)
begin
    if (!reset_n)
    begin
        dsm_base_valid <= 'b0;
        cr_ctl         <= 'b0;
        cr_num_batches <= 'b0;
        cr_dst_address <= DEF_DST_ADDR;
        cr_src_address <= DEF_SRC_ADDR;
        cr_dsm_base    <= DEF_DSM_BASE;
    end
    else if (rb2cf_C0RxCfgValid)
    begin
        case ({rb2cf_C0RxHdr[13:0], 2'b00}) /* synthesis parallel_case */
            // Control register: no mode restriction
            CSR_CTL:
                cr_ctl <= rb2cf_C0RxData[31:0];

            // Configuration-mode registers
            CSR_SRC_ADDR:
                if (~test_go)
                    cr_src_address <= rb2cf_C0RxData[31:0];
            CSR_DST_ADDR:
                if (~test_go)
                    cr_dst_address <= rb2cf_C0RxData[31:0];
            CSR_AFU_DSM_BASEH:
                if (~test_go)
                    cr_dsm_base[63:32] <= rb2cf_C0RxData[31:0];
            CSR_AFU_DSM_BASEL:
                if (~test_go)
                begin
                    cr_dsm_base[31:0] <= rb2cf_C0RxData[31:0];
                    dsm_base_valid    <= 'b1;   // writing the low half marks the DSM base as usable
                end

            // Execution-mode register: each write announces one more task batch
            CSR_NUM_BATCHES:
                if (re2xy_go)
                    cr_num_batches <= cr_num_batches + 'b1;
        endcase
    end
end
//-------------------------
//Round-Robin Load/Store
//-------------------------
//Sequential Logic
// State registers of the round-robin load/store engine.
// Synchronous active-low reset clears everything; otherwise each register
// simply captures the next-state value computed by the combinational arbiter.
always @ (posedge clk)
begin
    if (!reset_n)
    begin
        WrAddrOffset   <= {RBB_ADDR_WIDTH{1'b0}};
        RdAddrOffset   <= {TBB_WR_ADDR_WIDTH{1'b0}};
        rbb_pointer    <= {POINTER_WIDTH{1'b0}};
        tbb_pointer    <= {POINTER_WIDTH{1'b0}};
        NumBatchesRecv <= 32'd0;
    end
    else
    begin
        WrAddrOffset   <= WrAddrOffset_d;
        RdAddrOffset   <= RdAddrOffset_d;
        rbb_pointer    <= rbb_pointer_d;
        tbb_pointer    <= tbb_pointer_d;
        NumBatchesRecv <= NumBatchesRecv_d;
    end
end
//Combinatorial Logic
// Round-robin arbiter (combinational next-state / output logic).
//
// Two independent pointers walk over the NUM_PEA lanes:
//   tbb_pointer selects the task batch buffer (TBB) allowed to issue CCI reads,
//   rbb_pointer selects the result batch buffer (RBB) allowed to issue CCI writes.
//
// The handler is written once and indexed by the pointer instead of being
// replicated per lane with constant indices, so it elaborates cleanly for any
// supported NUM_PEA (1..4) and cannot drift between copies. For NUM_PEA=4 the
// behaviour is the same as the former four-way case statement.
//
// Outputs (all default to "hold" / zero):
//   RdHdr_valid, RdReqId, tbbReqAck_b  - one read request issued this cycle
//   WrHdr_valid, WrReqId, rbbReqAck_b  - one write request issued this cycle
//   *_d                                - next values of the state registers
always @ (*)
begin
    // Defaults: hold state, issue nothing
    tbb_pointer_d    = tbb_pointer;
    rbb_pointer_d    = rbb_pointer;
    RdAddrOffset_d   = RdAddrOffset;
    WrAddrOffset_d   = WrAddrOffset;
    NumBatchesRecv_d = NumBatchesRecv;
    RdHdr_valid      = 'b0;
    WrHdr_valid      = 'b0;
    tbbReqAck_b      = 'b0;
    rbbReqAck_b      = 'b0;
    RdReqId          = 'b0;
    WrReqId          = 'b0;

    if (re2xy_go) // only act during real execution
    begin
        //------------------------------------------------------------
        // TBB handler (CCI reads)
        //------------------------------------------------------------
        if (tbbFull_b[tbb_pointer])
        begin
            // Selected TBB is full: skip it without doing anything
            if (tbb_pointer == (NUM_PEA-1))
                tbb_pointer_d = 'b0;
            else
                tbb_pointer_d = tbb_pointer + 'b1;
        end
        else if (NumBatchesRecv != cr_num_batches)
        begin
            // At least one announced batch has not been fetched yet
            if (tbbReqValid_b[tbb_pointer])
            begin
                // TBB wants a line; issue it unless CCI channel 0 is stalled
                // (when stalled we simply wait on this lane)
                if (!ci2cf_C0TxAlmFull)
                begin
                    RdHdr_valid    = 'b1;
                    // NOTE(review): offset advances by 64 per request - confirm this matches
                    // the address granularity of the Tx header address field.
                    RdAddrOffset_d = RdAddrOffset + 'd64;
                    // Request ID = {lane pointer, line index}; zero-extended to 14 bits
                    RdReqId        = {tbb_pointer,
                                      tbbReqLineIdx_b[tbb_pointer*TBB_WR_ADDR_WIDTH +: TBB_WR_ADDR_WIDTH]};
                    tbbReqAck_b[tbb_pointer] = 'b1; // tell the TBB the request went out
                end
            end
            else
            begin
                // TBB has no more requests: all reads of this batch have been sent.
                // Count the batch and move on to the next lane.
                if (tbb_pointer == (NUM_PEA-1))
                    tbb_pointer_d = 'b0;
                else
                    tbb_pointer_d = tbb_pointer + 'b1;
                NumBatchesRecv_d = NumBatchesRecv + 'b1;
            end
        end

        //------------------------------------------------------------
        // RBB handler (CCI writes)
        //------------------------------------------------------------
        if (rbbEmpty_b[rbb_pointer])
        begin
            // Selected RBB is empty: skip it without doing anything
            if (rbb_pointer == (NUM_PEA-1))
                rbb_pointer_d = 'b0;
            else
                rbb_pointer_d = rbb_pointer + 'b1;
        end
        else if (rbbReqValid_b[rbb_pointer])
        begin
            // RBB wants to write a line; issue it unless CCI channel 1 is stalled
            // (when stalled we simply wait on this lane)
            if (!ci2cf_C1TxAlmFull)
            begin
                WrHdr_valid    = 'b1;
                WrAddrOffset_d = WrAddrOffset + 'd64;
                // Request ID = {lane pointer, line index}; zero-extended to 14 bits
                WrReqId        = {rbb_pointer,
                                  rbbReqLineIdx_b[rbb_pointer*RBB_ADDR_WIDTH +: RBB_ADDR_WIDTH]};
                rbbReqAck_b[rbb_pointer] = 'b1; // tell the RBB the request went out
            end
        end
        else
        begin
            // RBB is not empty but has nothing left to request: its result batch
            // has been sent back, move on to the next lane.
            // Open question from the original author: how to let the CPU know?
            if (rbb_pointer == (NUM_PEA-1))
                rbb_pointer_d = 'b0;
            else
                rbb_pointer_d = rbb_pointer + 'b1;
        end
    end
end
//-------------------------
//Handle CCI Tx Channels
//-------------------------
// Format Read Header
// Read address = source base combined with the running read offset.
// XOR acts as an add only while the offset never overlaps set bits of the base,
// which relies on the buffer alignment described in the file header.
wire [31:0] RdAddr = cr_src_address ^ RdAddrOffset;
// 61-bit CCI Tx read header assembled from the fields below (MSB first)
wire [TXHDR_WIDTH-1:0] RdHdr = {
5'h00, // [60:56] Byte Enable
rdreq_type, // [55:52] Request Type
6'h00, // [51:46] Rsvd
RdAddr, // [45:14] Address
RdReqId // [13:0] Meta data to track the SPL requests
};
// Format Write Header
// Write address = destination base combined with the running write offset (same XOR scheme as RdAddr)
wire [31:0] WrAddr = cr_dst_address ^ WrAddrOffset;
// Write data mux: picks the read-data slice of the RBB currently selected by
// rbb_pointer. The slice is addressed with an indexed part-select so that
//   - no constant slice falls outside rbbRdDout_b when NUM_PEA < 4, and
//   - the lane stride follows RBB_DATA_WIDTH, the width rbbRdDout_b is declared with
//     (identical to the previous DATA_WIDTH stride for the default 512/512 setup).
reg [DATA_WIDTH-1:0] WrData;
always @(*)
begin
    WrData = rbbRdDout_b[rbb_pointer*RBB_DATA_WIDTH +: RBB_DATA_WIDTH];
end
// 61-bit CCI Tx write header assembled from the fields below (MSB first);
// registered into cf2ci_C1TxHdr when the arbiter raises WrHdr_valid.
wire [TXHDR_WIDTH-1:0] WrHdr = {
5'h00, // [60:56] Byte Enable
wrreq_type, // [55:52] Request Type
6'h00, // [51:46] Rsvd
WrAddr, // [45:14] Address
WrReqId // [13:0] Meta data to track the SPL requests
};
// Sending Requests
// Registers the CCI Tx outputs for both channels.
//   Channel 1 (writes): when not almost-full, first issues a one-time write of
//     the AFU ID / version to ds_afuid_address, and afterwards forwards
//     WrHdr/WrData whenever re2xy_go and WrHdr_valid are set.
//   Channel 0 (reads): forwards RdHdr whenever re2xy_go and RdHdr_valid are set
//     and the channel is not almost-full.
// Both valid strobes default to 0 every cycle, so each request is presented
// for exactly one clock. Previously they were only cleared by reset, which
// left the strobe high and re-presented the stale header/data on every
// following cycle (including while AlmFull was asserted).
always @(posedge clk)
begin
    if (!reset_n)
    begin
        afuid_updtd       <= 'b0;
        cf2ci_C1TxHdr     <= 'b0;
        cf2ci_C1TxWrValid <= 'b0;
        cf2ci_C1TxData    <= 'b0;
        cf2ci_C0TxHdr     <= 'b0;
        cf2ci_C0TxRdValid <= 'b0;
    end
    else
    begin
        //Tx Path
        //--------------------------------------------------------------------------
        // Default: no request this cycle. The non-blocking assignments below
        // override these when a request is actually issued.
        cf2ci_C1TxWrValid <= 1'b0;
        cf2ci_C0TxRdValid <= 1'b0;

        // Channel 1
        if (ci2cf_C1TxAlmFull == 0)
        begin
            //The first write request should be DSM initialization
            if (ci2cf_InitDn && dsm_base_valid && !afuid_updtd)
            begin
                afuid_updtd   <= 1;   // sticky: the AFU ID is written only once
                cf2ci_C1TxHdr <= {
                    5'h0,             // [60:56] Byte Enable
                    WrLine,           // [55:52] Request Type
                    6'h00,            // [51:46] Rsvd
                    ds_afuid_address, // [45:14] Address
                    14'h3ffe          // [13:0]  Meta data to track the SPL requests
                };
                cf2ci_C1TxWrValid <= 1;
                cf2ci_C1TxData    <= {
                    368'h0,           // [511:144] Zeros
                    VERSION,          // [143:128] Version #2
                    BWA_MEM_SW        // [127:0]   AFU ID
                };
            end
            else if (re2xy_go) //Executing real tasks
            begin
                if (WrHdr_valid) // Write to Destination Workspace
                begin
                    cf2ci_C1TxHdr     <= WrHdr;
                    cf2ci_C1TxWrValid <= 1'b1;
                    cf2ci_C1TxData    <= WrData;
                end
            end // re2xy_go
        end // !ci2cf_C1TxAlmFull

        // Channel 0
        if (re2xy_go && RdHdr_valid && !ci2cf_C0TxAlmFull) // Read from Source Workspace
        begin
            cf2ci_C0TxHdr     <= RdHdr;
            cf2ci_C0TxRdValid <= 1;
        end

        // Simulation-only trace of the requests currently being presented
        // (reads the registered values, i.e. what was issued on the previous edge).
        /* synthesis translate_off */
        if (cf2ci_C1TxWrValid)
            $display("*Req Type: %x \t Addr: %x \n Data: %x", cf2ci_C1TxHdr[55:52], cf2ci_C1TxHdr[45:14], cf2ci_C1TxData);
        if (cf2ci_C0TxRdValid)
            $display("*Req Type: %x \t Addr: %x", cf2ci_C0TxHdr[55:52], cf2ci_C0TxHdr[45:14]);
        /* synthesis translate_on */
    end
end
//-------------------------
//Handle Responses
//-------------------------
// Cfg responses are already handled in the configuration mode and write
// responses are ignored, so only read responses are decoded here.
//
// The upper tag bits of the read response header, [13:TBB_WR_ADDR_WIDTH],
// pick which TBB receives the returned line; only that TBB's write enable is
// raised. The write address (low tag bits) and write data are wired straight
// from the response to every TBB instance, so no per-TBB address/data mux is
// needed in this block.
wire tbb_rd_rsp_fire = re2xy_go && rb2cf_C0RxRdValid; // a read response arrived while running

always @(*)
begin
    tbbWrEn_b = 'b0;
    case (rb2cf_C0RxHdr[13:TBB_WR_ADDR_WIDTH]) /* synthesis parallel_case */
        'd0: tbbWrEn_b[0] = tbb_rd_rsp_fire;
        'd1: tbbWrEn_b[1] = tbb_rd_rsp_fire;
        'd2: tbbWrEn_b[2] = tbb_rd_rsp_fire;
        'd3: tbbWrEn_b[3] = tbb_rd_rsp_fire;
    endcase
end
// Function: Returns physical address for a DSM register.
//   offset_b : 10-bit register offset; only bits [9:6] are used.
//   base_b   : 64-bit DSM base; only bits [37:6] are used.
// The result is base_b[37:6] + offset_b[9:6], truncated to 32 bits (the low
// six bits of both inputs are dropped before the addition).
function automatic [31:0] dsm_offset2addr;
    input [9:0]  offset_b;
    input [63:0] base_b;
    reg   [31:0] base_line;
    reg   [3:0]  line_offset;
    begin
        base_line       = base_b[37:6];
        line_offset     = offset_b[9:6];
        dsm_offset2addr = base_line + line_offset;
    end
endfunction
////----------------------------------------------------------------------------------------------------------------------------------------------
//// Instances
////----------------------------------------------------------------------------------------------------------------------------------------------
// One TBB and one RBB are instantiated per index 0..NUM_PEA-1. Per-instance
// signals are carried on packed buses; instance i owns bit i of each
// single-bit bus and the i-th WIDTH-wide slice of each multi-bit bus.
// NOTE(review): the generate loops are unnamed, so instance paths rely on
// tool-assigned block names; labels are not added here because that would
// change the hierarchical names.
generate
    genvar i;

    //-------------------------
    //TBB Generation
    //-------------------------
    for (i = 0; i < NUM_PEA; i = i + 1) begin
        tbb #(
            .TBB_WR_ADDR_WIDTH (TBB_WR_ADDR_WIDTH),
            .TBB_WR_DATA_WIDTH (TBB_WR_DATA_WIDTH),
            .TBB_RD_ADDR_WIDTH (TBB_RD_ADDR_WIDTH),
            .TBB_RD_DATA_WIDTH (TBB_RD_DATA_WIDTH)
        ) tbb (
            .clk        (clk),
            .reset_n    (reset_n),
            .task_start (bm2pe_start_b[i]),
            .task_done  (pe2bm_done_b[i]),
            .ReqValid   (tbbReqValid_b[i]),
            .ReqLineIdx (tbbReqLineIdx_b[i*TBB_WR_ADDR_WIDTH +: TBB_WR_ADDR_WIDTH]),
            .ReqAck     (tbbReqAck_b[i]),
            .WrEn       (tbbWrEn_b[i]),
            // Write address and data come directly from the read response and
            // are shared by all TBBs; WrEn selects the one that stores them.
            .WrAddr     (rb2cf_C0RxHdr[TBB_WR_ADDR_WIDTH-1:0]),
            .WrDin      (rb2cf_C0RxData),
            .Full       (tbbFull_b[i]),
            .RdAddr     (tbbRdAddr_b[i*TBB_RD_ADDR_WIDTH +: TBB_RD_ADDR_WIDTH]),
            .RdDout     (tbbRdDout_b[i*TBB_RD_DATA_WIDTH +: TBB_RD_DATA_WIDTH]),
            .Empty      (tbbEmpty_b[i])
        );
    end

    //-------------------------
    //RBB Generation
    //-------------------------
    for (i = 0; i < NUM_PEA; i = i + 1) begin
        rbb #(
            .RBB_ADDR_WIDTH (RBB_ADDR_WIDTH),
            .RBB_DATA_WIDTH (RBB_DATA_WIDTH)
        ) rbb (
            .clk        (clk),
            .reset_n    (reset_n),
            .task_done  (pe2bm_done_b[i]),
            .ReqValid   (rbbReqValid_b[i]),
            .ReqLineIdx (rbbReqLineIdx_b[i*RBB_ADDR_WIDTH +: RBB_ADDR_WIDTH]),
            .ReqAck     (rbbReqAck_b[i]),
            .WrEn       (rbbWrEn_b[i]),
            .WrAddr     (rbbWrAddr_b[i*RBB_ADDR_WIDTH +: RBB_ADDR_WIDTH]),
            .WrDin      (rbbWrDin_b[i*RBB_DATA_WIDTH +: RBB_DATA_WIDTH]),
            .Full       (rbbFull_b[i]),
            .RdAddr     (rbbRdAddr_b[i*RBB_ADDR_WIDTH +: RBB_ADDR_WIDTH]),
            .RdDout     (rbbRdDout_b[i*RBB_DATA_WIDTH +: RBB_DATA_WIDTH]),
            .Empty      (rbbEmpty_b[i])
        );
    end
endgenerate
endmodule