Change code to use block RAMs - verilog

I want to implement in Verilog the following Matlab code:
symBuf = [symBuf(numFFT/2+1:end); zeros(numFFT/2,1)];
symBuf(KF+(1:KF)) = symBuf(KF+(1:KF)) + txSymb;
It is a simple overlap and add operation.
Here is my implementation:
module overlap
#(K = 3,
FFT = 128
)
(
input signed [15:0] symbInReal ,
input signed [15:0] symbInImag ,
input clock ,
input reset ,
input readyIn ,
input validIn ,
input lastIn ,
output signed [15:0] outReal ,
output signed [15:0] outImag ,
output reg lastOut ,
output wire readyOut ,
output reg validOut
);
reg signed [15:0] previousSymbolReal [2*FFT*K-1:0] ;
reg signed [15:0] previousSymbolImag [2*FFT*K-1:0] ;
reg signed [15:0] txSymbolBuffReal [K*FFT-1:0] ;
reg signed [15:0] txSymbolBuffImag [K*FFT-1:0] ;
reg [15:0] counter ;
reg [1:0] state ;
reg [3:0] nextstate ;
reg [15:0] clockcount ;
reg signed [15:0] outputValueReal ;
reg signed [15:0] outputValueImag ;
reg [15:0] buffcount ;
reg [7:0] symboutcount ;
reg [7:0] symbincount ;
reg last ;
reg lastvalidout ;
wire lastout ;
integer i;
initial begin
for (i=0; i<2*FFT*K ; i = i + 1) begin
previousSymbolReal[i] = 0;
previousSymbolImag[i] = 0;
end
end
always#(posedge clock) begin
if(~reset) begin
counter <= 0;
end else begin
counter <= counter +1;
if(nextstate != state)
counter <= 0;
end
end
always#(*) begin
if(~reset) begin
nextstate = 0;
end else begin
nextstate = state;
if(readyIn) begin
case(state)
4'd0: begin
if(validIn || last) begin
nextstate = 1;
end
end
4'd1: begin
if (counter == (FFT*K-2)) begin
nextstate = 2;
end
end
4'd2: begin
nextstate = 0;
end
endcase
end
end
end
always#(posedge clock) begin
if(~reset) begin
state <= 0;
end else begin
if(readyIn)
state <= nextstate;
end
end
always#(posedge clock) begin
if(~reset) begin
clockcount <= 0;
symboutcount <= 0;
lastOut <= 0;
end else begin
if(readyIn) begin
clockcount <= clockcount +1 ;
case(state)
4'd0: begin
validOut <= 0;
clockcount <= 0;
lastOut <= 0;
end
4'd1: begin
if(~lastvalidout)
validOut <= 1;
outputValueReal <= previousSymbolReal[clockcount+ FFT/2];
outputValueImag <= previousSymbolImag[clockcount+ FFT/2];
end
4'd2: begin
outputValueReal <= previousSymbolReal[clockcount + FFT/2];
outputValueImag <= previousSymbolImag[clockcount + FFT/2];
clockcount <= 0;
if(~lastvalidout)
validOut <= 1;
if(symboutcount == symbincount + 1 && last)
lastOut <= 1;
symboutcount <= symboutcount +1 ;
end
endcase
end
end
end
assign readyOut = readyIn;
genvar M;
generate
for(M=0;M<K*FFT;M=M+1) begin
always#(posedge clock) begin
if(state==2) begin
previousSymbolReal[M] <= previousSymbolReal[M+FFT/2];
previousSymbolImag[M] <= previousSymbolImag[M+FFT/2];
end
end
end
for(M=K*FFT;M<2*K*FFT-FFT/2;M=M+1) begin
always#(posedge clock) begin
if(state==2) begin
previousSymbolReal[M] <= previousSymbolReal[M+FFT/2]+txSymbolBuffReal[M-K*FFT];
previousSymbolImag[M] <= previousSymbolImag[M+FFT/2]+txSymbolBuffImag[M-K*FFT];
end
end
end
for(M=2*K*FFT-FFT/2;M<2*K*FFT;M=M+1) begin
always#(posedge clock) begin
if(state==2) begin
previousSymbolReal[M] <= txSymbolBuffReal[M-K*FFT];
previousSymbolImag[M] <= txSymbolBuffImag[M-K*FFT];
end
end
end
endgenerate
always#(posedge clock) begin
if(~reset) begin
buffcount <= 0;
symbincount <= 0;
last <= 0;
end else begin
if(validIn) begin
txSymbolBuffReal[buffcount] <= symbInReal;
txSymbolBuffImag[buffcount] <= symbInImag;
buffcount <= buffcount +1;
if(buffcount == K*FFT-1) begin
symbincount <= symbincount + 1;
buffcount <= 0;
end
if(lastIn)
last <= 1;
end
end
end
always#(posedge clock) begin
if(~reset)
lastvalidout <= 0;
else begin
if(last && lastOut)
lastvalidout <= 1;
end
end
assign outReal = outputValueReal;
assign outImag = outputValueImag;
endmodule
The problem here is that I have 4 huge arrays which take up to 4 times what is available in my FPGA.
Hence, I want to be able to use block RAMs. However, I don't think it's possible due to the number of read and write operations performed.
Does anyone have a solution for this?

However, I don't think it's possible due to the number of read and write operations performed.
Correct. At least, not without major changes to your design.
A typical block RAM element can only read or write one (or sometimes two) values per clock cycle, but your generate loops are trying to update every element in the RAM at once!
To make this operation use a block RAM, you will need to implement a state machine to update one element per clock cycle, and to sequence operations such that other states wait until the updates have completed.
If you want to accelerate this, you may be able to split the array into multiple block RAMs so that multiple values can be updated in parallel. (You will need to carefully consider which elements need to be read/written to avoid conflicts.)

Related

Minimum in array

I am trying to build a module for practice which gets as input:
clk, rst.
an array of 3 numbers each one is 4 bits.
The final purpose is to return in the output the minimum value in the array but I can't get it really working and I don't understand why.
module minVal(
input logic rstN, clk,
input logic unsigned [2:0][3:0] resArray,
output logic [3:0] minVal
);
logic unsigned [3:0] currRes;
always_ff #(posedge clk, posedge rstN) begin
if(rstN == 1'b1) begin
currRes <= 4'b1111;
end
else begin
for(int i = 0; i < 3; i++) begin
if( resArray[i] < currRes) begin
currRes <= resArray[i];
minVal <= resArray[i];
end
end
end
end
endmodule
I wrote the following test bench:
module minVal_tb();
logic rstN, clk;
logic unsigned [2:0][3:0] resArray;
logic [3:0] minVal;
minVal thisInst(.rstN(rstN), .clk(clk), .resArray(resArray), .minVal(minVal));
always begin
#20 clk = ~clk;
end
initial begin
clk = 1'b0;
rstN = 1'b1;
resArray[0] = 5;
resArray[1] = 1;
resArray[2] = 3;
#5 rstN = 1'b0;
end
endmodule
I expect the output to be 1 right after the first clock cycle, but I get it only after 2 clock cycles. Why?
When you unroll the for loop, this is what it would look like:
always_ff #(posedge clk, posedge rstN) begin
if(rstN == 1'b1) begin
currRes <= 4'b1111;
end
else begin
if( resArray[0] < currRes) begin
currRes <= resArray[0];
minVal <= resArray[0];
end
if( resArray[1] < currRes) begin
currRes <= resArray[1];
minVal <= resArray[1];
end
if( resArray[2] < currRes) begin
currRes <= resArray[2];
minVal <= resArray[2];
end
end
end
You end up with multiple nonblocking assignments to the same register (currRes).
On the 1st posedge of the clock after reset, all 3 if clauses are true, and the last assignment wins:
currRes <= resArray[2];
So, currRes is assigned the value 3, not 1.
The same is true of minVal.
You need to sort the 3 input values, then compare the minimum of those values to the current minimum.

Can't resolve multiple constant drivers for net Quartus error

this is a very basic question but if somebody can help me with these errors, I'd really appreciate it. I am an EE undergrad, new to Verilog, and I'd appreciate any help/explanation.
The errors I received were:
10028 Can't resolve multiple constant drivers for "led" at compare_block.sv (66)
10029 Constant driver at compare_block.sv(61)
What are the multiple constant drivers here? Originally, I had the led assignment (led <= led_val) within the second always block, and I thought moving it up to the first always block would change the errors, but it did not.
module compare_block (clk, reset_n, result, led);
parameter data_width = 8; /
parameter size = 1000;
input clk, reset_n;
input [data_width:0] result;
logic [(data_width):0] data_from_rom;
logic [9:0] addr_to_rom;
output reg led;
reg [(data_width):0] comp_sig [size-1:0];
reg [data_width:0] comp_sig_temp;
reg [(data_width):0] filt_sig [size-1:0];
reg [(data_width):0] ans_sig [size-1:0];
integer i, iii;
reg [9:0] ii ;
integer sum_array;
wire [(data_width+3):0] array_avg;
reg [data_width:0] summed_arr [size-2 : 0];//container for all summation steps
wire total_sum;
reg ans_sig_done;
reg [data_width : 0] summed_ans;
reg [data_width:0] max_val, error_val;
initial begin
i = 0;
ii = 0;
led=0;
data_from_rom='b000000000;
summed_ans='b000000000;
max_val='b000000000;
for (iii=0;iii<(size-1);iii=iii+1) begin
filt_sig[iii]=0;
ans_sig [iii]=0;
comp_sig[iii]=0;
summed_arr[iii]=0;
end
end
//port map to ROM
rom_compare ref_wave(
.clk(clk),
.addr(addr_to_rom), //text file?
.data(data_from_rom)
);
//Moore FSM
localparam [3:0]
s0=0, s1 = 1, s2 = 2, s3 = 3, s4 = 4, s5 = 5;
reg [3:0] state_reg, state_next;
initial state_next = 0;
always #(posedge clk, negedge reset_n) begin
if (reset_n == 'b0) begin //reset is active low
state_reg <= s0;
end else begin
state_reg <= state_next;
led <= led_val;
end
end
always #(state_reg) begin
state_next = state_reg;
led=0;
case (state_reg)
s0 : begin //initial state, reset state
if (!reset_n) begin
led <= 0;
ii <= 0;
end else begin
state_next <= s1;
end
end
s1 : begin
if (ii>(size)) begin
ii <= 0;
end else begin
addr_to_rom <= ii;
state_next <= s2;
end
end
s2 : begin
comp_sig_temp <= data_from_rom;
filt_sig [ii] <= result;
state_next <= s3;
end
s3 : begin
comp_sig[ii] <= comp_sig_temp;
state_next <= s4;
end
s4 : begin
ans_sig[ii] <= filt_sig[ii] - comp_sig[ii];
state_next <= s5;
end
s5 : begin
if (ii>(size-2)) begin
ans_sig_done = 1;
end else begin
ans_sig_done = 0;
end
ii <= ii+1;
state_next <= s0;
end
endcase
end
reg [(data_width+2):0] sum;
integer j;
always #* begin
sum = 0;
if (ans_sig_done == 1) begin
for (j=4; j<(size-1); j=j+2) begin
sum = sum +ans_sig[j];
if (ans_sig[j] > max_val) begin
max_val = ans_sig[j];
end
end
end
end
assign array_avg = sum >> 'd3; //2^3 = 8
always #(clk, result) begin //posedge clk, result
filt_sig [i] <= result;
i <= i + 1;
end
assign error_val = max_val >> 'd2; //approx 25% of max value of array
reg led_val;
always #(*)
begin
if (array_avg < error_val) begin
led_val <= 'b1;
end else begin
led_val <= 'b0;
end
end
endmodule
I figured it out!
The two instances were me trying to initialize one led = 0 in one always block and then assigning led <= led_val in another. You can't refer to an output in two different always blocks.

Module instantiation with the "number sign"

I have the main module with FIFO stuff.
Here it is:
module syn_fifo #(
parameter DATA_WIDTH = 8, // inpit capacity
parameter DATA_DEPTH = 8 // the depth of the FIFO
)
(
input wire clk,
input wire rst,
// Write_______________________________________________
input wire [DATA_WIDTH-1:0]din, // the input data
input wire wren, // Write anable
output wire full,
// Read________________________________________________
output wire [DATA_WIDTH-1:0]dout, // The output data
input wire rden, // Read enable
output wire empty
);
integer q_size; // The queue size(length)
integer golova; // The queue beginning
integer hvost; // The end of queue
reg [DATA_WIDTH-1:0]fifo[DATA_DEPTH-1:0];
assign full = (q_size == DATA_DEPTH) ? 1'b1: 1'b0; // FIFO is full
/*
True { full = (q_size==DATA_TEPTH) = 1 }, then wire "full" goes to "1" value
False { full = (q_size==DATA_TEPTH) = 0 }, then wire "full" goes to "0" value
*/
assign empty = (golova == hvost); // FIFO is empty
assign dout = fifo[hvost]; // FWFT (other write mode)
integer i;
//___________(The queue fullness)___________________
always #(posedge clk or posedge rst)
begin
if (rst == 1'b1)
begin
for (i = 0; i < DATA_DEPTH; i = i + 1) // incrementing the FIFO
fifo[i] <= 0; // Resetting the FIFO
golova <= 0; // Resetting the queue start variable
end
else
begin //Write_______________________________________
if (wren && ~full)
begin
fifo[golova] <= din; // putting data in to the golova
if (golova == DATA_DEPTH-1) // restrictions for the queue beginning
golova <= 0; // Reset the beginning
else
golova <= golova + 1; // other occurence incrementing
end
end
end
//Reading
always #(posedge clk or posedge rst)
begin
if (rst == 1'b1)
begin
hvost <= 0;
end
else
begin
if (rden && !empty)
/*for staying inside the queue limits - make the check of non equality of the "hvost" & "queue size"*/
begin
if (hvost == DATA_DEPTH-1) // if hvost = DATA_DEPTH-1, then
hvost <= 0; // Reset hvost
else
hvost <= hvost + 1;
end
end
end
always # (posedge clk)
begin
if (rst == 1'b1) begin
q_size <= 0;
end
else
begin
case ({wren && ~full, rden && ~empty} )
2'b01: q_size <= q_size + 1; // RO
2'b10: q_size <= q_size - 1; // WO
default: q_size <= q_size; // read and write at the same time
endcase
end
end
endmodule
Also i've got the testbench module down delow:
`timescale 1ns / 1ps
module fifo_tb();
localparam CLK_PERIOD = 10;
reg clk;
reg rst;
always begin
clk <= 1'b0;
#(CLK_PERIOD / 2);
clk <= 1'b1;
#(CLK_PERIOD / 2);
end
localparam DATA_WIDTH = 8;
localparam DATA_DEPTH = 4;
reg [DATA_WIDTH-1:0]din;
reg wren;
reg rden;
wire [DATA_WIDTH-1:0]dout;
wire empty;
wire full;
wire wr_valid;
wire rd_valid;
task write;
input integer length;
begin
if (length) begin
#(posedge clk);
wren <= 1'b1;
while (length) begin
#(posedge clk);
if (wr_valid) begin
length <= length - 1;
if (length == 1) begin
wren <= 1'b0;
end
end
end
end
end
endtask
task read;
input integer length;
begin
if (length) begin
#(posedge clk);
rden <= 1'b1;
while (length) begin
#(posedge clk);
if (rd_valid) begin
length <= length - 1;
if (length == 1) begin
rden <= 1'b0;
end
end
end
end
end
endtask
initial begin
rst <= 1'b0;
wren <= 1'b0;
rden <= 1'b0;
#50;
rst <= 1'b1;
#50;
rst <= 1'b0;
#200;
/* Test Start */
//write(4);
//read(4);
/* Test Stop */
#1000;
$finish;
end
assign wr_valid = wren & ~full;
assign rd_valid = rden & ~empty;
always #(posedge clk) begin
if (rst == 1'b1) begin
din <= 0;
end else begin
if (wr_valid == 1'b1) begin
din <= din + 1;
end
end
end
// write?
always begin
#400;
write(5);
#15;
write(7);
#25;
write(3);
#15;
write(9);
#15;
write(1);
#10000;
end
// read?
always begin
#420;
read(3);
#37;
read(13);
#21;
read(7);
#15;
read(9);
#15;
read(4);
#20;
read(7);
#10000;
end
initial begin
$dumpfile("test.vcd");
$dumpvars(0,fifo_tb);
end
syn_fifo #(.DATA_WIDTH(DATA_WIDTH),
.DATA_DEPTH(DATA_DEPTH)) dut ( .clk(clk),
.rst(rst),
.din(din),
.wren(wren),
.full(full),
.dout(dout),
.rden(rden),
.empty(empty));
endmodule
Trying to compile all of it with iVerilog + GTKwave + Win10 by next command:
C:\Program Files\iverilog\bin>iverilog -o fifo.v fifo_tb.v
The compiler gives me the next message:
fifo_tb.v:138:error: Unknown module type:syn_fifo
2 error(s) during elaboration.
These modules were missing:syn_fifo referenced 1 times
At the necessary line "138" maybe the main mistake is covered by the "Number sign" in module instantiation?
/*132|*/ initial begin
/*133|*/ $dumpfile("test.vcd");
/*134|*/ $dumpvars(0,fifo_tb);
/*135|*/ end
/*136|*/
/*137|*/ syn_fifo #(.DATA_WIDTH(DATA_WIDTH),
/*138|*/ .DATA_DEPTH(DATA_DEPTH)) dut ( .clk(clk),
/*139|*/ .rst(rst),
/*140|*/ .din(din),
/*141|*/ .wren(wren),
/*142|*/ .full(full),
/*143|*/ .dout(dout),
/*144|*/ .rden(rden),
/*145|*/ .empty(empty));
/*146|*/
/*147|*/ endmodule
I'm not shure of that.
Seems like you are indicating fifo.v to be your output file, try:
iverilog -o syn_fifo.tb -s fifo_tb fifo_tb.v fifo.v
-o -> output file
-s -> top module (in this case, the test one)
(after everything, include all the files)
Then, to run it:
vvp syn_fifo.tb
Thank you, dear #m4j0rt0m
I just forgot to type in the output file name at the CMD window. Was very exhausted so haven't noticed such a detail)))
Usually it looks like:
iverilog -o OUTPUT_FILE_NAME fifo_tb.v fifo.v
And also I tried your advice, and it's finally done!

Making Vivado Synthesis "A process triggered every clock cycle will not have functionality every clock cycle"

This is code for ALU that does addition and multiplication only. An addition is handled in same clock cycle but the multiplication result has to be delayed by 3 clock cycles.
module my_addmul(
//control signals
input i_clk,
input i_rst,
input i_en,
//add=01, mul=10
input [1:0] i_op,
//input and output registers
input [31:0] i_A,
input [31:0] i_B,
output [31:0] o_D,
//to signal if output is valid
output o_done
);
//registers to save output
reg [31:0] r_D;
reg [63:0] r_mul;//(*keep="true"*)
reg r_mul_done;
reg r_mul_done2;
reg r_done;
//updating outputs
assign o_D = r_D;
assign o_done = r_done;
always # (posedge i_clk)
begin
r_done <= 0;
r_mul_done <= 0;
if (i_rst) begin
r_D <= 0;
r_mul <= 0;
r_mul_done <= 0;
r_mul_done2 <= 0;
end else if (i_clk == 1) begin
if (i_en == 1) begin
//addition - assignment directly to OP registers
if (i_op == 01) begin
r_done <= 1;
r_D <= i_A + i_B;
//multiplication - indirect assignment to OP registers
end else if (i_op == 2'b10) begin
r_mul <= i_A * i_B;
r_mul_done <= 1;
end
end
//1-clock cycle delay
r_mul_done2 <= (r_mul_done == 1) ? 1 : 0;
//updating outputs in the 3rd cycle
if (r_mul_done2 == 1) begin
r_D <= r_mul[31:0];
r_done <= 1;
end
end
end
endmodule
The problem is that if the keep attribute is not used, the r_mul register that stores the multiplication output until 3rd clock cycle is optimised out. I read on the problem and realised that Vivado is thinking like this: "If the multiplication happens every clock cycle, the r_mul is over-written before it is sent to output. Therefore, it is a register being written but not read, Lets remove it!" Since I insert the 3 clock cycle wait in test bench, the simulation result is always accurate. I want to know what is the "Proper" way of doing this so I don't have to use the keep attribute. It is an ok solution but I think useful techniques should be learned so hacks don't have to be used. Any ideas or discussion welcome.
If I want to delay a signal, I'd probably insert flops for that. You can probably flop your mul_output like the way you do for the mul_done signal. Also, it is better to have different always blocks for doing the same. You can check the code below but it might be buggy since I haven't simulated/synthesized it -
module my_addmul(
//control signals
input i_clk,
input i_rst,
input i_en,
//add=01, mul=10
input [1:0] i_op,
//input and output registers
input [31:0] i_A,
input [31:0] i_B,
output [31:0] o_D,
//to signal if output is valid
output o_done
);
//registers to save output
reg [31:0] r_D;
reg [63:0] r_mul;//(*keep="true"*)
reg r_mul_1;
reg r_mul_2;
reg r_mul_done;
reg r_mul_done2;
reg r_done;
//updating outputs
assign o_D = r_D;
assign o_done = r_done;
always # (posedge i_clk)
begin
r_done <= 0;
r_mul_done <= 0;
if (i_rst) begin
r_D <= 0;
r_mul <= 0;
r_mul_done <= 0;
r_mul_done2 <= 0;
end else if (i_clk == 1) begin
if (i_en == 1) begin
//addition - assignment directly to OP registers
if (i_op == 01) begin
r_done <= 1;
r_D <= i_A + i_B;
//multiplication - indirect assignment to OP registers
end else if (i_op == 2'b10) begin
r_mul <= i_A * i_B;
r_mul_done <= 1;
end
end
end
end
always # (posedge i_clk)
begin
if (i_rst)
begin
r_mul_1 <= 0;
r_mul_done2 <= 0;
end
else
begin
r_mul_1 <= r_mul;
r_mul_done2 <= r_mul_done;
end
end
always # (posedge i_clk)
begin
if (i_rst)
begin
r_D <= 0;
r_done <= 0;
end
else
begin
r_D <= r_mul_1;
r_done <= r_mul_done2;
end
end
endmodule

How to execute a for loop over multiple clock cycles?

The for loop is working properly , but everything is happening in a single clock cycle. How I make it run a single iteration per cycle?
`timescale 1ns/10ps
module multiplier (clock,multiplier,multiplicand,start,done,product);
input [7:0] multiplier ,multiplicand;
input start;
input clock;
output [15:0] product;
output done;
reg [15:0] multiplierF ,multiplicandF;
reg [15:0] productF;
reg doneF;
integer i;
assign product = productF;
assign done = doneF;
task rpa_16;
input [15:0] multiplierF;
inout [15:0] productF;
assign productF = multiplierF + productF;
endtask
always # (posedge clock or posedge start)
begin
if(start)
begin
multiplierF = 0 ;
multiplicandF = 0 ;
productF = 0 ;
doneF = 0 ;
end
else
begin
multiplierF = {8'b0,multiplier};
multiplicandF = {8'b0,multiplicand};
end
end
always #(posedge clk)
begin
if(!doneF)
begin
for (i=0;i<7;i=i+1)
begin
if(multiplicand[i])
begin
rpa_16(multiplierF,productF);
multiplierF = multiplierF << 1;
productF = productF;
end
else
begin
multiplierF = multiplierF << 1;
end
end
doneF = 1;
end
end
I cant paste a picture of the waveform , but I want i to increment after each positive edge . But whats happening is , for loop executes in a single clock cycle and I get the output.
For loops do not imply anything sequential in verilog. If you want a loop that takes 8 clock cycles, then you'll have to rewrite it with an explicit counter variable, perhaps something like this:
always #(posedge clk or negedge reset_)
if(!reset_) begin
multiplierF <= 0;
loopcount <= 0;
doneF <= 0;
end else begin
if(!doneF) begin
loopcount <= loopcount + 1;
if(multiplicand[loopcount]) begin
rpa_16(multiplierF,productF);
multiplierF = multiplierF << 1;
productF = productF;
end else begin
multiplierF = multiplierF << 1;
end
end
if(loopcount == 7) doneF <= 1;
end
Also, you should not be assigning variables like multiplierF in multiple always blocks, you will get non-deterministic behavior and will likely fail to synthesize. On posedge clk both blocks will execute and you cannot know which one will execute last, so it may give different results on different simulators.

Resources