(一)资源优化
1 DSP优化
创建优化的DSP映射
创建文件 dsp_optimized_pe.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Optimized PE module - dual INT8 multiply-accumulate targeted at DSP slices
//
// Function (signed arithmetic):
//     p_out = a0*b0 + a1*b1 + c_in
//
// The previous revision instantiated a DSP48E2 primitive directly, but that
// instantiation could not work:
//   * USE_SIMD only accepts "ONE48", "TWO24" or "FOUR12", and SIMD applies to
//     the ALU only; it cannot be combined with USE_MULT("MULTIPLY").
//   * DSP48E2 has no ".CE" port (it has per-register enables such as CEA2,
//     CEB2, CEC, CEM, CEP, ...).
//   * Multiplying {a1, 0s, a0} by {b1, 0, b0} gives
//         a0*b0 + (a0*b1 << 9) + (a1*b0 << 16) + (a1*b1 << 25)
//     whose terms overlap, so the two wanted products cannot be sliced out.
//   * c_in was added twice (once on the C port, once more in fabric).
// Both variants are therefore described behaviorally and left to synthesis to
// map onto DSP slices.
//
// Parameters:
//   USE_DSP_PACKING  1: four-stage pipeline shaped like the DSP48E2 register
//                       chain (input -> multiplier -> adder -> output), with
//                       use_dsp hints. a/b/c_in are sampled in the same cycle.
//                    0: original three-stage inferred pipeline.
//
// Ports:
//   clk, rst_n  clock and synchronous active-low reset
//   ce          clock enable for every pipeline register
//   a0, a1      signed INT8 activations
//   b0, b1      signed INT8 weights
//   c_in        signed 32-bit accumulator input
//   p_out       signed 32-bit result
//////////////////////////////////////////////////////////////////////////////////
module dsp_optimized_pe #(
    parameter USE_DSP_PACKING = 1 // select the DSP-shaped pipeline
)(
    input  wire               clk,
    input  wire               rst_n,
    input  wire               ce,
    // two INT8 operand pairs
    input  wire signed [7:0]  a0, a1, // activations
    input  wire signed [7:0]  b0, b1, // weights
    input  wire signed [31:0] c_in,   // accumulator input
    output reg  signed [31:0] p_out   // result
);

    generate
        if (USE_DSP_PACKING) begin : gen_packed_dsp
            // Stage 1: operand registers (A/B/C input registers of the slice)
            (* use_dsp = "yes" *) reg signed [7:0]  a0_q, a1_q, b0_q, b1_q;
            reg signed [31:0] c_q;
            // Stage 2: multiplier registers; c_in is delayed to stay aligned
            (* use_dsp = "yes" *) reg signed [15:0] m0_q, m1_q;
            reg signed [31:0] c_qq;
            // Stage 3: post-adder register
            (* use_dsp = "yes" *) reg signed [31:0] acc_q;

            always @(posedge clk) begin
                if (!rst_n) begin
                    a0_q  <= 8'sd0;
                    a1_q  <= 8'sd0;
                    b0_q  <= 8'sd0;
                    b1_q  <= 8'sd0;
                    c_q   <= 32'sd0;
                    m0_q  <= 16'sd0;
                    m1_q  <= 16'sd0;
                    c_qq  <= 32'sd0;
                    acc_q <= 32'sd0;
                    p_out <= 32'sd0;
                end else if (ce) begin
                    a0_q  <= a0;
                    a1_q  <= a1;
                    b0_q  <= b0;
                    b1_q  <= b1;
                    c_q   <= c_in;

                    m0_q  <= a0_q * b0_q;
                    m1_q  <= a1_q * b1_q;
                    c_qq  <= c_q;

                    // All operands are signed, so they are sign-extended to 32 bits.
                    acc_q <= m0_q + m1_q + c_qq;

                    // Stage 4: output register (result appears 4 enabled cycles
                    // after the operands were sampled).
                    p_out <= acc_q;
                end
            end
        end else begin : gen_standard_dsp
            // Standard implementation (inferred by the synthesis tool)
            reg signed [15:0] prod0, prod1;
            reg signed [31:0] sum;

            always @(posedge clk) begin
                if (!rst_n) begin
                    prod0 <= 16'b0;
                    prod1 <= 16'b0;
                    sum   <= 32'b0;
                    p_out <= 32'b0;
                end else if (ce) begin
                    // two independent multiplies
                    prod0 <= a0 * b0;
                    prod1 <= a1 * b1;
                    // accumulate; note c_in is sampled one cycle after a/b here
                    sum   <= prod0 + prod1 + c_in;
                    p_out <= sum;
                end
            end
        end
    endgenerate

endmodule
创建DSP资源监控模块
创建文件 dsp_monitor.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// DSP resource usage monitor
//
// Counts, per clock cycle in which `enable` is high, how many bits of
// `dsp_active` are set, and derives per-DSP activity counters, the peak
// number of simultaneously active DSPs and a windowed average/utilization.
//
// Fixes relative to the previous revision:
//   * The number of active DSPs is now a combinational population count. The
//     old code used several non-blocking assignments to one register in the
//     same block, the last of which cleared it, so the count was always 0.
//   * utilization_percent is computed from the window total that is being
//     closed instead of the previous window's average_usage.
//   * A window is exactly MONITOR_PERIOD enabled cycles long.
//
// Parameters:
//   NUM_DSPS        number of monitored DSP activity bits
//   MONITOR_PERIOD  window length in enabled clock cycles
//
// NOTE: the unpacked-array output port `active_cycles` requires the file to
// be compiled as SystemVerilog.
//////////////////////////////////////////////////////////////////////////////////
module dsp_monitor #(
    parameter NUM_DSPS = 64,
    parameter MONITOR_PERIOD = 1000 // window length (clock cycles)
)(
    input  wire                clk,
    input  wire                rst_n,
    input  wire                enable,
    // DSP activity flags
    input  wire [NUM_DSPS-1:0] dsp_active,
    // monitor outputs
    output reg  [31:0]         total_cycles,
    output reg  [31:0]         active_cycles [0:NUM_DSPS-1],
    output reg  [7:0]          utilization_percent,
    output reg  [31:0]         peak_usage,
    output reg  [31:0]         average_usage
);

    // internal state
    reg  [31:0] period_counter;     // enabled cycles elapsed in this window
    reg  [31:0] usage_accumulator;  // sum of active counts in this window
    reg  [31:0] active_now;         // DSPs active in the current cycle

    integer i;
    integer k;

    // population count of dsp_active
    always @* begin
        active_now = 32'd0;
        for (k = 0; k < NUM_DSPS; k = k + 1) begin
            active_now = active_now + dsp_active[k];
        end
    end

    // window total including the current cycle
    wire [31:0] window_total  = usage_accumulator + active_now;
    // 64-bit intermediate so the *100 scaling cannot wrap
    wire [63:0] window_scaled = {32'd0, window_total} * 64'd100;
    wire        window_done   = (period_counter >= MONITOR_PERIOD - 1);

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            total_cycles        <= 0;
            utilization_percent <= 0;
            peak_usage          <= 0;
            average_usage       <= 0;
            period_counter      <= 0;
            usage_accumulator   <= 0;
            for (i = 0; i < NUM_DSPS; i = i + 1) begin
                active_cycles[i] <= 0;
            end
        end else if (enable) begin
            total_cycles <= total_cycles + 1;

            // per-DSP activity counters
            for (i = 0; i < NUM_DSPS; i = i + 1) begin
                if (dsp_active[i]) begin
                    active_cycles[i] <= active_cycles[i] + 1;
                end
            end

            // peak number of simultaneously active DSPs
            if (active_now > peak_usage) begin
                peak_usage <= active_now;
            end

            if (window_done) begin
                // close the window and publish its statistics
                average_usage       <= window_total / MONITOR_PERIOD;
                utilization_percent <= window_scaled / (MONITOR_PERIOD * NUM_DSPS);
                period_counter      <= 0;
                usage_accumulator   <= 0;
            end else begin
                period_counter    <= period_counter + 1;
                usage_accumulator <= window_total;
            end
        end
    end

    // utilization report (simulation only)
`ifdef SIMULATION
    // Delayed by one cycle so the freshly published values are printed.
    reg report_q;

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            report_q <= 1'b0;
        end else begin
            report_q <= enable && window_done;
            if (report_q) begin
                $display("DSP利用率报告 @%0t:", $time);
                $display(" 平均使用: %0d/%0d DSPs", average_usage, NUM_DSPS);
                $display(" 利用率: %0d%%", utilization_percent);
                $display(" 峰值使用: %0d DSPs", peak_usage);
            end
        end
    end
`endif

endmodule
2 存储优化
创建高效的存储管理器
创建文件 memory_manager.v:
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Storage manager - BRAM/URAM backed buffer with optional second bank and
// sequential-access statistics.
//
// Fixes relative to the previous revision:
//   * rd_data, rd_hits and rd_misses each have exactly one driver. Previously
//     rd_data was driven from both the memory block and the prefetch block,
//     and the counters from both the prefetch block and a reset-only block.
//   * The prefetch buffer was never written but was forwarded to rd_data on a
//     "hit"; it has been removed. The prefetch option now only tracks whether
//     a read address equals the previous read address + 1.
//   * The URAM arrays are sized from DATA_WIDTH/ADDR_WIDTH instead of assuming
//     8-bit data and 10-bit addresses, and both banks carry the ram_style
//     attribute.
//
// Parameters:
//   DATA_WIDTH     word width
//   ADDR_WIDTH     address width (depth = 2**ADDR_WIDTH)
//   USE_URAM       1 = request UltraRAM, 0 = request block RAM
//   DOUBLE_BUFFER  1 = buffer_swap selects bank B, 0 = always bank A
//   PREFETCH       1 = count sequential read hits/misses, 0 = counters stay 0
//
// Read latency is one clock; rd_valid is rd_en delayed by one clock.
//
// NOTE(review): as in the original, reads and writes use the SAME bank for a
// given buffer_swap value. A ping-pong scheme would normally read the bank
// that is not being written - confirm the intended behavior with the caller.
//////////////////////////////////////////////////////////////////////////////////
module memory_manager #(
    parameter DATA_WIDTH    = 8,
    parameter ADDR_WIDTH    = 10,
    parameter USE_URAM      = 1, // 1=URAM, 0=BRAM
    parameter DOUBLE_BUFFER = 1, // enable the second bank
    parameter PREFETCH      = 1  // enable sequential-access statistics
)(
    input  wire                  clk,
    input  wire                  rst_n,
    // write interface
    input  wire                  wr_en,
    input  wire [ADDR_WIDTH-1:0] wr_addr,
    input  wire [DATA_WIDTH-1:0] wr_data,
    // read interface
    input  wire                  rd_en,
    input  wire [ADDR_WIDTH-1:0] rd_addr,
    output reg  [DATA_WIDTH-1:0] rd_data,
    output reg                   rd_valid,
    // control
    input  wire                  buffer_swap,  // bank select
    output reg                   buffer_ready, // buffer ready flag
    // performance monitoring
    output reg  [31:0]           rd_hits,      // sequential read hits
    output reg  [31:0]           rd_misses     // sequential read misses
);

    localparam MEM_DEPTH = 2**ADDR_WIDTH;

    // Bank B is used only when double buffering is enabled and swap is set.
    wire use_bank_b = (DOUBLE_BUFFER != 0) && buffer_swap;

    generate
        if (USE_URAM) begin : gen_uram
            (* ram_style = "ultra" *)
            reg [DATA_WIDTH-1:0] uram_array_a [0:MEM_DEPTH-1];
            (* ram_style = "ultra" *)
            reg [DATA_WIDTH-1:0] uram_array_b [0:MEM_DEPTH-1];

            always @(posedge clk) begin
                if (wr_en) begin
                    if (use_bank_b)
                        uram_array_b[wr_addr] <= wr_data;
                    else
                        uram_array_a[wr_addr] <= wr_data;
                end
                if (rd_en) begin
                    if (use_bank_b)
                        rd_data <= uram_array_b[rd_addr];
                    else
                        rd_data <= uram_array_a[rd_addr];
                end
            end
        end else begin : gen_bram
            (* ram_style = "block" *)
            reg [DATA_WIDTH-1:0] bram_array_a [0:MEM_DEPTH-1];
            (* ram_style = "block" *)
            reg [DATA_WIDTH-1:0] bram_array_b [0:MEM_DEPTH-1];

            always @(posedge clk) begin
                if (wr_en) begin
                    if (use_bank_b)
                        bram_array_b[wr_addr] <= wr_data;
                    else
                        bram_array_a[wr_addr] <= wr_data;
                end
                if (rd_en) begin
                    if (use_bank_b)
                        rd_data <= bram_array_b[rd_addr];
                    else
                        rd_data <= bram_array_a[rd_addr];
                end
            end
        end
    endgenerate

    // Sequential-access tracking and hit/miss counters (single driver).
    reg [ADDR_WIDTH-1:0] predicted_addr;  // previous read address + 1
    reg                  predicted_valid; // set after the first read

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            predicted_addr  <= 0;
            predicted_valid <= 1'b0;
            rd_hits         <= 0;
            rd_misses       <= 0;
        end else if (rd_en && (PREFETCH != 0)) begin
            predicted_addr  <= rd_addr + 1'b1;
            predicted_valid <= 1'b1;
            if (predicted_valid && (rd_addr == predicted_addr))
                rd_hits <= rd_hits + 1;
            else
                rd_misses <= rd_misses + 1;
        end
    end

    // read-valid generation: rd_en delayed by the one-cycle memory latency
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n)
            rd_valid <= 1'b0;
        else
            rd_valid <= rd_en;
    end

    // buffer status: simplified, always ready once out of reset
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n)
            buffer_ready <= 1'b0;
        else
            buffer_ready <= 1'b1;
    end

endmodule
(二)PS系统集成
创建Block Design
- 将卷积引擎打包为IP:
创建文件 package_conv_engine.tcl:
###############################################
# Package the convolution engine as an IP core
#
# Creates a scratch Vivado project, adds the RTL, packages it as
# user.org:user:conv_engine:1.0 and archives the result.
#
# Changes relative to the previous revision:
#   * Removed `set_property driver_strength ...` (not an IP-core property) and
#     `auto_family_support_level optimized` (not a legal value); the supported
#     families are set explicitly instead.
#   * AXI interfaces are inferred with ipx::infer_bus_interfaces (all ports of
#     the interface) rather than from a single port name.
#   * The memory map / address block are created only if packaging did not
#     already infer them.
#   * Integrity is checked before the core is saved and archived.
###############################################

# IP packaging project location
set ip_proj_dir "./ip_repo/conv_engine_ip"

# Create the packaging project
create_project conv_engine_ip $ip_proj_dir -part xczu9eg-ffvb1156-2-e -force
set_property board_part xilinx.com:zcu102:part0:3.4 [current_project]

# Add source files
add_files -norecurse {
    ./src/hdl/conv_engine/conv_engine_top.v
    ./src/hdl/conv_engine/systolic_array.v
    ./src/hdl/conv_engine/systolic_controller.v
    ./src/hdl/primitives/processing_element.v
    ./src/hdl/primitives/dsp_optimized_pe.v
    ./src/hdl/utilities/memory_manager.v
}

# Set the top module
set_property top conv_engine_top [current_fileset]

# Package the IP; -import_files copies the sources under the IP root so the
# archive is self-contained.
ipx::package_project -root_dir $ip_proj_dir -vendor user.org \
    -library user -name conv_engine -taxonomy /UserIP -import_files

set core [ipx::current_core]

# Core identification
set_property vendor_display_name {YOLO V10 Conv Engine} $core
set_property display_name {Convolution Engine for YOLO V10} $core
set_property description {High-performance systolic array based convolution engine optimized for YOLO V10} $core
set_property company_url {http://www.example.com} $core
set_property supported_families {zynquplus Production} $core
set_property version 1.0 $core

# Infer clock/reset and the AXI interfaces
ipx::infer_bus_interface clk xilinx.com:signal:clock_rtl:1.0 $core
ipx::infer_bus_interface rst_n xilinx.com:signal:reset_rtl:1.0 $core
ipx::infer_bus_interfaces xilinx.com:interface:aximm_rtl:1.0 $core
ipx::infer_bus_interfaces xilinx.com:interface:axis_rtl:1.0 $core

# Associate the clock with every bus interface
ipx::associate_bus_interfaces -busif s_axi -clock clk $core
ipx::associate_bus_interfaces -busif s_axis -clock clk $core
ipx::associate_bus_interfaces -busif m_axis -clock clk $core

# Memory map for the AXI-Lite register block (4 KiB)
if {[llength [ipx::get_memory_maps -quiet s_axi -of_objects $core]] == 0} {
    ipx::add_memory_map s_axi $core
}
set_property slave_memory_map_ref s_axi \
    [ipx::get_bus_interfaces s_axi -of_objects $core]
set reg_map [ipx::get_memory_maps s_axi -of_objects $core]
if {[llength [ipx::get_address_blocks -quiet axi_lite_regs -of_objects $reg_map]] == 0} {
    ipx::add_address_block axi_lite_regs $reg_map
}
set_property range 4096 \
    [ipx::get_address_blocks axi_lite_regs -of_objects $reg_map]

# GUI customization page
# NOTE(review): assumes conv_engine_top declares ARRAY_SIZE and DATA_WIDTH
# parameters - confirm against the RTL.
ipgui::add_page -name {Basic} -component $core \
    -display_name {Basic Configuration}
ipgui::add_param -name {ARRAY_SIZE} -component $core \
    -parent [ipgui::get_pagespec -name Basic -component $core]
ipgui::add_param -name {DATA_WIDTH} -component $core \
    -parent [ipgui::get_pagespec -name Basic -component $core]

# Finalize, verify, save and archive the core
ipx::create_xgui_files $core
ipx::update_checksums $core
ipx::check_integrity $core
ipx::save_core $core
ipx::archive_core $ip_proj_dir/conv_engine_ip_1.0.zip $core

close_project

puts "IP核打包完成!"
puts "IP保存位置: $ip_proj_dir"
连接系统组件
创建TCL脚本 build_system_bd.tcl:
# Build the conv_engine_system block design: Zynq UltraScale+ PS, the
# convolution engine IP, an AXI DMA, interconnect, interrupts and a debug ILA.
#
# Changes relative to the previous revision:
#   * Interrupts go to pl_ps_irq0 through an xlconcat in the order
#     {conv engine, DMA MM2S, DMA S2MM}, i.e. GIC SPI 89/90/91. The former
#     axi_intc had no AXI/clock/reset connection and only carried MM2S.
#   * Clock/reset pins are connected only if connection automation has not
#     already wired them.
#   * The second DMA data-path automation targets the still-unconnected DMA
#     master pin.
#   * The wrapper is added using the path returned by make_wrapper.

# Connect src -> dst unless either pin is missing or dst already has a net.
proc connect_once {src_pin dst_pin} {
    set src [get_bd_pins -quiet $src_pin]
    set dst [get_bd_pins -quiet $dst_pin]
    if {[llength $src] == 0 || [llength $dst] == 0} {
        puts "WARNING: skipping $src_pin -> $dst_pin (pin not found)"
        return
    }
    if {[llength [get_bd_nets -quiet -of_objects $dst]] == 0} {
        connect_bd_net $src $dst
    }
}

# Run AXI connection automation between master and slave through intc,
# clocking everything from pl_clk0. target is the interface pin to automate.
proc axi_automation {master slave intc target} {
    set clk {/zynq_ultra_ps_e_0/pl_clk0 (200 MHz)}
    apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [list \
        Clk_master $clk \
        Clk_slave  $clk \
        Clk_xbar   $clk \
        Master     $master \
        Slave      $slave \
        ddr_seg    Auto \
        intc_ip    $intc \
        master_apm 0 \
    ] [get_bd_intf_pins $target]
}

# Open the block design
open_bd_design {conv_engine_system.bd}

# 1. Add the PS
if {[llength [get_bd_cells -quiet zynq_ultra_ps_e_0]] == 0} {
    create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.5 zynq_ultra_ps_e_0
    apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e \
        -config {apply_board_preset "1"} [get_bd_cells zynq_ultra_ps_e_0]
}

# 2. Configure the PS
set_property -dict [list \
    CONFIG.PSU__USE__M_AXI_GP0 {1} \
    CONFIG.PSU__USE__M_AXI_GP1 {0} \
    CONFIG.PSU__USE__S_AXI_GP0 {1} \
    CONFIG.PSU__USE__S_AXI_GP2 {1} \
    CONFIG.PSU__SAXIGP0__DATA_WIDTH {128} \
    CONFIG.PSU__SAXIGP2__DATA_WIDTH {128} \
    CONFIG.PSU__USE__IRQ0 {1} \
    CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ {200} \
    CONFIG.PSU__CRL_APB__PL1_REF_CTRL__FREQMHZ {100} \
] [get_bd_cells zynq_ultra_ps_e_0]

# 3. Add the convolution engine IP
create_bd_cell -type ip -vlnv user.org:user:conv_engine:1.0 conv_engine_0

# 4. Add the AXI interconnect (control path)
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_0
set_property -dict [list CONFIG.NUM_SI {1} CONFIG.NUM_MI {1}] [get_bd_cells axi_interconnect_0]

# 5. Add the DMA controller
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
set_property -dict [list \
    CONFIG.c_include_sg {0} \
    CONFIG.c_sg_include_stscntrl_strm {0} \
    CONFIG.c_sg_length_width {26} \
    CONFIG.c_m_axi_mm2s_data_width {64} \
    CONFIG.c_m_axis_mm2s_tdata_width {64} \
    CONFIG.c_mm2s_burst_size {256} \
    CONFIG.c_m_axi_s2mm_data_width {64} \
    CONFIG.c_s_axis_s2mm_tdata_width {64} \
    CONFIG.c_s2mm_burst_size {256} \
] [get_bd_cells axi_dma_0]

# 6. Add AXI SmartConnect for the high-performance data path
create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0
set_property -dict [list CONFIG.NUM_SI {2} CONFIG.NUM_MI {1}] [get_bd_cells smartconnect_0]

# 7. Control path: PS -> convolution engine registers
axi_automation /zynq_ultra_ps_e_0/M_AXI_HPM0_FPD /conv_engine_0/s_axi \
    /axi_interconnect_0 conv_engine_0/s_axi

# 8. Control path: PS -> DMA registers
axi_automation /zynq_ultra_ps_e_0/M_AXI_HPM0_FPD /axi_dma_0/S_AXI_LITE \
    /axi_interconnect_0 axi_dma_0/S_AXI_LITE

# 9. Data path: DMA -> DDR through S_AXI_HP0_FPD
axi_automation /axi_dma_0/M_AXI_MM2S /zynq_ultra_ps_e_0/S_AXI_HP0_FPD \
    /smartconnect_0 zynq_ultra_ps_e_0/S_AXI_HP0_FPD
axi_automation /axi_dma_0/M_AXI_S2MM /zynq_ultra_ps_e_0/S_AXI_HP0_FPD \
    /smartconnect_0 axi_dma_0/M_AXI_S2MM

# 10. Stream interfaces
connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] \
    [get_bd_intf_pins conv_engine_0/s_axis]
connect_bd_intf_net [get_bd_intf_pins conv_engine_0/m_axis] \
    [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM]

# 11. Clocks (only pins that automation left unconnected)
connect_once zynq_ultra_ps_e_0/pl_clk0 conv_engine_0/clk
connect_once zynq_ultra_ps_e_0/pl_clk0 axi_dma_0/s_axi_lite_aclk
connect_once zynq_ultra_ps_e_0/pl_clk0 axi_dma_0/m_axi_mm2s_aclk
connect_once zynq_ultra_ps_e_0/pl_clk0 axi_dma_0/m_axi_s2mm_aclk

# 12. Resets (only pins that automation left unconnected)
connect_once zynq_ultra_ps_e_0/pl_resetn0 conv_engine_0/rst_n
connect_once zynq_ultra_ps_e_0/pl_resetn0 axi_dma_0/axi_resetn

# 13. Interrupts: bit0 = conv engine, bit1 = DMA MM2S, bit2 = DMA S2MM
# NOTE(review): the conv engine interrupt pin is assumed to be named "irq";
# connect_once skips it with a warning if the IP has no such pin.
create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_irq
set_property -dict [list CONFIG.NUM_PORTS {3}] [get_bd_cells xlconcat_irq]
connect_once conv_engine_0/irq        xlconcat_irq/In0
connect_once axi_dma_0/mm2s_introut   xlconcat_irq/In1
connect_once axi_dma_0/s2mm_introut   xlconcat_irq/In2
connect_once xlconcat_irq/dout        zynq_ultra_ps_e_0/pl_ps_irq0

# 14. ILA debug core (optional)
create_bd_cell -type ip -vlnv xilinx.com:ip:system_ila:1.1 system_ila_0
set_property -dict [list CONFIG.C_SLOT_0_AXI_PROTOCOL {AXI4S}] [get_bd_cells system_ila_0]
connect_bd_intf_net [get_bd_intf_pins system_ila_0/SLOT_0_AXIS] \
    [get_bd_intf_pins conv_engine_0/m_axis]
connect_once zynq_ultra_ps_e_0/pl_clk0    system_ila_0/clk
connect_once zynq_ultra_ps_e_0/pl_resetn0 system_ila_0/resetn

# 15. Validate the design
validate_bd_design

# 16. Save the block design
save_bd_design

# 17. Generate output products
generate_target all [get_files conv_engine_system.bd]

# 18. Create the HDL wrapper and add the file make_wrapper reports
set wrapper_file [make_wrapper -files [get_files conv_engine_system.bd] -top]
add_files -norecurse $wrapper_file

# 19. Set the top module
set_property top conv_engine_system_wrapper [current_fileset]
生成比特流
运行完整实现流程
# Full implementation flow: synthesis -> implementation -> bitstream -> XSA.
# wait_on_run returns even when a run fails, so each run's PROGRESS is checked
# before moving on; otherwise a failed run would only surface later as a
# confusing write_hw_platform error.

# Abort with a message unless the named run finished completely.
proc require_run_complete {run_name} {
    set run [get_runs $run_name]
    if {[get_property PROGRESS $run] ne "100%"} {
        error "Run $run_name did not complete: [get_property STATUS $run]"
    }
}

# Run synthesis
launch_runs synth_1 -jobs 8
wait_on_run synth_1
require_run_complete synth_1

# Run implementation through bitstream generation in a single launch
launch_runs impl_1 -to_step write_bitstream -jobs 8
wait_on_run impl_1
require_run_complete impl_1

# Export the hardware platform (including the bitstream)
write_hw_platform -fixed -include_bit -force \
    -file ./conv_engine_system.xsa
Linux驱动
创建内核驱动
创建文件 conv_engine_driver.c:
/*
 * Linux kernel driver for the convolution engine
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/interrupt.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/of_dma.h>
#include <linux/dmaengine.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/cdev.h>

#define DRIVER_NAME "conv_engine"
#define DEVICE_NAME "conv_engine"

/* Register offsets (bytes from the start of the mapped register block) */
#define CONV_CTRL_REG 0x00
#define CONV_STATUS_REG 0x04
#define CONV_IMG_SIZE_REG 0x08
#define CONV_KERNEL_REG 0x0C
#define CONV_CHANNEL_REG 0x10
#define CONV_STRIDE_PAD_REG 0x14
#define CONV_PERF_CNT_REG 0x18

/* Control register bits */
#define CTRL_START_BIT (1 << 0)
#define CTRL_CLEAR_BIT (1 << 1)
#define CTRL_WEIGHT_LOAD (1 << 2)

/* Status register bits */
#define STATUS_BUSY_BIT (1 << 0)
#define STATUS_DONE_BIT (1 << 1)

/*
 * Per-device state, allocated in probe and stored as platform driver data.
 */
struct conv_engine_dev {
	void __iomem *regs;		/* mapped register block */
	struct device *dev;		/* owning platform device */
	struct cdev cdev;		/* character device */
	dev_t devno;			/* allocated device number */

	/* DMA: channels plus one coherent bounce buffer per direction */
	struct dma_chan *tx_chan;
	struct dma_chan *rx_chan;
	dma_addr_t tx_dma_handle;
	dma_addr_t rx_dma_handle;
	void *tx_virt;
	void *rx_virt;
	size_t dma_size;		/* size of each bounce buffer in bytes */

	/* Interrupt; dma_complete is signalled by the ISR and the DMA callback */
	int irq;
	struct completion dma_complete;

	/* Performance statistics, updated by the ISR */
	u32 total_inferences;
	u64 total_cycles;
};

/* Device class used when creating the device node */
static struct class *conv_engine_class;

/* Register access helpers */
/* Read the 32-bit register at byte offset @offset. */
static inline u32 conv_read_reg(struct conv_engine_dev *dev, u32 offset)
{
	void __iomem *addr = dev->regs + offset;

	return ioread32(addr);
}

/* Write @value to the 32-bit register at byte offset @offset. */
static inline void conv_write_reg(struct conv_engine_dev *dev, u32 offset, u32 value)
{
	void __iomem *addr = dev->regs + offset;

	iowrite32(value, addr);
}

/* Interrupt handler */
static irqreturn_t conv_engine_isr(int irq, void *dev_id)
{struct conv_engine_dev *dev = dev_id;u32 status;status = conv_read_reg(dev, CONV_STATUS_REG);if (status & STATUS_DONE_BIT) {/* 清除完成标志 */conv_write_reg(dev, CONV_CTRL_REG, CTRL_CLEAR_BIT);/* 更新统计 */dev->total_inferences++;dev->total_cycles += conv_read_reg(dev, CONV_PERF_CNT_REG);/* 通知完成 */complete(&dev->dma_complete);return IRQ_HANDLED;}return IRQ_NONE;
}/* DMA回调函数 */
/* dmaengine completion callback; @completion is a struct completion *. */
static void dma_complete_callback(void *completion)
{
	struct completion *done = completion;

	complete(done);
}

/* Configure and start a DMA transfer */
/*
 * Run one blocking DMA transfer through the driver's coherent bounce buffer.
 *
 * @dev:   device state
 * @src:   kernel buffer; copied from when @is_tx, copied into otherwise
 * @len:   number of bytes; must be non-zero and at most dev->dma_size
 * @is_tx: true = memory-to-device on tx_chan, false = device-to-memory on
 *         rx_chan
 *
 * Returns 0 on success, -EINVAL for a bad buffer/length or a submit error,
 * -ENOMEM if no descriptor could be prepared, -ETIMEDOUT after 5 s.
 *
 * The length check is new: without it a request larger than the bounce
 * buffer overflowed tx_virt/rx_virt in memcpy().
 */
static int conv_engine_dma_transfer(struct conv_engine_dev *dev, void *src, size_t len, bool is_tx)
{
	struct dma_chan *chan = is_tx ? dev->tx_chan : dev->rx_chan;
	dma_addr_t buf_handle = is_tx ? dev->tx_dma_handle : dev->rx_dma_handle;
	enum dma_transfer_direction dir = is_tx ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM;
	struct completion *cmp = &dev->dma_complete;
	struct dma_async_tx_descriptor *tx_desc;
	unsigned long remaining;
	dma_cookie_t cookie;

	if (!src || len == 0 || len > dev->dma_size) {
		dev_err(dev->dev, "Invalid DMA length %zu (max %zu)\n",
			len, dev->dma_size);
		return -EINVAL;
	}

	/* Stage outgoing data in the coherent buffer */
	if (is_tx)
		memcpy(dev->tx_virt, src, len);

	/* Prepare the DMA descriptor */
	tx_desc = dmaengine_prep_slave_single(chan, buf_handle, len, dir,
					      DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
	if (!tx_desc) {
		dev_err(dev->dev, "Failed to prepare DMA descriptor\n");
		return -ENOMEM;
	}

	/* Install the completion callback */
	tx_desc->callback = dma_complete_callback;
	tx_desc->callback_param = cmp;

	/* Arm the completion before the transfer can finish, then submit */
	init_completion(cmp);
	cookie = dmaengine_submit(tx_desc);
	if (dma_submit_error(cookie)) {
		dev_err(dev->dev, "Failed to submit DMA\n");
		return -EINVAL;
	}

	/* Start the DMA */
	dma_async_issue_pending(chan);

	/* Wait for completion; 0 means the timeout expired */
	remaining = wait_for_completion_timeout(cmp, msecs_to_jiffies(5000));
	if (remaining == 0) {
		dev_err(dev->dev, "DMA timeout\n");
		dmaengine_terminate_all(chan);
		return -ETIMEDOUT;
	}

	/* Hand received data back to the caller */
	if (!is_tx)
		memcpy(src, dev->rx_virt, len);

	return 0;
}

/* Run a convolution inference */
/*
 * Run one inference: send the input by DMA, start the engine, wait for its
 * done interrupt, then fetch the output by DMA.
 *
 * Returns 0 on success, -EBUSY if the engine reports busy, -ETIMEDOUT if the
 * done interrupt does not arrive within 1 s, or the error returned by
 * conv_engine_dma_transfer().
 *
 * The completion is shared between the DMA callback and the ISR, so it is
 * re-armed immediately before START is written; otherwise a complete() left
 * over from earlier activity would end the wait before the engine finished.
 *
 * NOTE(review): nothing in this file calls this function; the file_operations
 * table has no read/write handlers.
 */
static int conv_engine_run_inference(struct conv_engine_dev *dev,
				     void *input_data, size_t input_size,
				     void *output_data, size_t output_size)
{
	unsigned long remaining;
	u32 status;
	int ret;

	/* Refuse to start while the engine is busy */
	status = conv_read_reg(dev, CONV_STATUS_REG);
	if (status & STATUS_BUSY_BIT) {
		dev_err(dev->dev, "Device is busy\n");
		return -EBUSY;
	}

	/* Send the input data */
	ret = conv_engine_dma_transfer(dev, input_data, input_size, true);
	if (ret) {
		dev_err(dev->dev, "Failed to send input data\n");
		return ret;
	}

	/* Re-arm the completion, then start the convolution */
	reinit_completion(&dev->dma_complete);
	conv_write_reg(dev, CONV_CTRL_REG, CTRL_START_BIT);

	/* Wait for the done interrupt */
	remaining = wait_for_completion_timeout(&dev->dma_complete,
						msecs_to_jiffies(1000));
	if (remaining == 0) {
		dev_err(dev->dev, "Inference timeout\n");
		return -ETIMEDOUT;
	}

	/* Receive the output data */
	ret = conv_engine_dma_transfer(dev, output_data, output_size, false);
	if (ret) {
		dev_err(dev->dev, "Failed to receive output data\n");
		return ret;
	}

	return 0;
}

/* File operations */
static int conv_engine_open(struct inode *inode, struct file *file)
{struct conv_engine_dev *dev;dev = container_of(inode->i_cdev, struct conv_engine_dev, cdev);file->private_data = dev;return 0;
}static int conv_engine_release(struct inode *inode, struct file *file)
{return 0;
}static long conv_engine_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{struct conv_engine_dev *dev = file->private_data;int ret = 0;switch (cmd) {case 0x1001: /* 设置图像尺寸 */conv_write_reg(dev, CONV_IMG_SIZE_REG, arg);break;case 0x1002: /* 设置卷积核尺寸 */conv_write_reg(dev, CONV_KERNEL_REG, arg);break;case 0x1003: /* 设置通道数 */conv_write_reg(dev, CONV_CHANNEL_REG, arg);break;case 0x1004: /* 获取性能计数 */ret = put_user(dev->total_cycles / dev->total_inferences, (u32 __user *)arg);break;default:ret = -EINVAL;}return ret;
}static const struct file_operations conv_engine_fops = {.owner = THIS_MODULE,.open = conv_engine_open,.release = conv_engine_release,.unlocked_ioctl = conv_engine_ioctl,
};/* 平台驱动probe函数 */
static int conv_engine_probe(struct platform_device *pdev)
{struct conv_engine_dev *dev;struct resource *res;int ret;dev_info(&pdev->dev, "Probing conv_engine driver\n");/* 分配设备结构 */dev = devm_kzalloc(&pdev->dev, sizeof(*dev), GFP_KERNEL);if (!dev)return -ENOMEM;dev->dev = &pdev->dev;/* 获取并映射寄存器 */res = platform_get_resource(pdev, IORESOURCE_MEM, 0);dev->regs = devm_ioremap_resource(&pdev->dev, res);if (IS_ERR(dev->regs))return PTR_ERR(dev->regs);/* 获取中断 */dev->irq = platform_get_irq(pdev, 0);if (dev->irq < 0)return dev->irq;/* 注册中断处理 */ret = devm_request_irq(&pdev->dev, dev->irq, conv_engine_isr,IRQF_SHARED, DRIVER_NAME, dev);if (ret) {dev_err(&pdev->dev, "Failed to request IRQ\n");return ret;}/* 获取DMA通道 */dev->tx_chan = dma_request_slave_channel(&pdev->dev, "tx");if (!dev->tx_chan) {dev_err(&pdev->dev, "Failed to request TX DMA channel\n");return -EPROBE_DEFER;}dev->rx_chan = dma_request_slave_channel(&pdev->dev, "rx");if (!dev->rx_chan) {dev_err(&pdev->dev, "Failed to request RX DMA channel\n");dma_release_channel(dev->tx_chan);return -EPROBE_DEFER;}/* 分配DMA缓冲区 */dev->dma_size = 640 * 640 * 3; /* 最大图像尺寸 */dev->tx_virt = dma_alloc_coherent(&pdev->dev, dev->dma_size,&dev->tx_dma_handle, GFP_KERNEL);if (!dev->tx_virt) {dev_err(&pdev->dev, "Failed to allocate TX DMA buffer\n");ret = -ENOMEM;goto err_dma_alloc;}dev->rx_virt = dma_alloc_coherent(&pdev->dev, dev->dma_size,&dev->rx_dma_handle, GFP_KERNEL);if (!dev->rx_virt) {dev_err(&pdev->dev, "Failed to allocate RX DMA buffer\n");ret = -ENOMEM;goto err_rx_alloc;}/* 初始化完成量 */init_completion(&dev->dma_complete);/* 注册字符设备 */ret = alloc_chrdev_region(&dev->devno, 0, 1, DEVICE_NAME);if (ret < 0) {dev_err(&pdev->dev, "Failed to allocate char device region\n");goto err_chrdev;}cdev_init(&dev->cdev, &conv_engine_fops);dev->cdev.owner = THIS_MODULE;ret = cdev_add(&dev->cdev, dev->devno, 1);if (ret) {dev_err(&pdev->dev, "Failed to add char device\n");goto err_cdev_add;}/* 创建设备节点 */device_create(conv_engine_class, &pdev->dev, dev->devno,NULL, DEVICE_NAME);platform_set_drvdata(pdev, 
dev);dev_info(&pdev->dev, "Conv engine driver probed successfully\n");return 0;err_cdev_add:unregister_chrdev_region(dev->devno, 1);
err_chrdev:dma_free_coherent(&pdev->dev, dev->dma_size,dev->rx_virt, dev->rx_dma_handle);
err_rx_alloc:dma_free_coherent(&pdev->dev, dev->dma_size,dev->tx_virt, dev->tx_dma_handle);
err_dma_alloc:dma_release_channel(dev->rx_chan);dma_release_channel(dev->tx_chan);return ret;
}static int conv_engine_remove(struct platform_device *pdev)
{struct conv_engine_dev *dev = platform_get_drvdata(pdev);device_destroy(conv_engine_class, dev->devno);cdev_del(&dev->cdev);unregister_chrdev_region(dev->devno, 1);dma_free_coherent(&pdev->dev, dev->dma_size,dev->rx_virt, dev->rx_dma_handle);dma_free_coherent(&pdev->dev, dev->dma_size,dev->tx_virt, dev->tx_dma_handle);dma_release_channel(dev->rx_chan);dma_release_channel(dev->tx_chan);return 0;
}static const struct of_device_id conv_engine_of_match[] = {{ .compatible = "xlnx,conv-engine-1.0", },{},
};
MODULE_DEVICE_TABLE(of, conv_engine_of_match);

/* Platform driver descriptor */
static struct platform_driver conv_engine_driver = {
	.probe = conv_engine_probe,
	.remove = conv_engine_remove,
	.driver = {
		.name = DRIVER_NAME,
		.of_match_table = conv_engine_of_match,
	},
};

/* Module entry: create the device class, then register the platform driver. */
static int __init conv_engine_init(void)
{
	int err;

	conv_engine_class = class_create(THIS_MODULE, DEVICE_NAME);
	if (IS_ERR(conv_engine_class))
		return PTR_ERR(conv_engine_class);

	err = platform_driver_register(&conv_engine_driver);
	if (!err)
		return 0;

	/* Registration failed: drop the class again */
	class_destroy(conv_engine_class);
	return err;
}

/* Module exit: unregister the driver first, then destroy the class. */
static void __exit conv_engine_exit(void)
{
	platform_driver_unregister(&conv_engine_driver);
	class_destroy(conv_engine_class);
}

module_init(conv_engine_init);
module_exit(conv_engine_exit);

MODULE_DESCRIPTION("Convolution Engine Driver for YOLO V10");
MODULE_AUTHOR("Your Name");
MODULE_LICENSE("GPL v2");
创建设备树配置
创建文件 conv_engine.dtsi:
/*
 * Device tree fragment for the convolution engine and its AXI DMA.
 *
 * Changes relative to the previous revision:
 *   - "interrupts = <0 90 40 91 4>" and "dmas = <&axi_dma_0 0&axi_dma_0 1>"
 *     were two cells/specifiers fused together; they are separated again.
 *   - The DMA node follows the Linux Xilinx AXI DMA binding: compatible
 *     "xlnx,axi-dma-1.00.a" with one child node per channel, each carrying
 *     its own interrupt.
 *
 * Interrupt numbers are GIC SPIs: 89 = conv engine, 90 = DMA MM2S,
 * 91 = DMA S2MM.
 * NOTE(review): base addresses, the clock index and xlnx,addrwidth must match
 * the generated hardware - confirm against the exported address map.
 */
/ {
	conv_engine_0: conv_engine@a0000000 {
		compatible = "xlnx,conv-engine-1.0";
		reg = <0x0 0xa0000000 0x0 0x10000>;
		interrupt-parent = <&gic>;
		interrupts = <0 89 4>;
		interrupt-names = "conv_irq";
		dmas = <&axi_dma_0 0>, <&axi_dma_0 1>;
		dma-names = "tx", "rx";
		clocks = <&zynqmp_clk 71>;
		clock-names = "axi_clk";
	};

	axi_dma_0: dma@a0010000 {
		compatible = "xlnx,axi-dma-1.00.a";
		reg = <0x0 0xa0010000 0x0 0x10000>;
		#dma-cells = <1>;
		interrupt-parent = <&gic>;
		clocks = <&zynqmp_clk 71>, <&zynqmp_clk 71>, <&zynqmp_clk 71>;
		clock-names = "s_axi_lite_aclk", "m_axi_mm2s_aclk", "m_axi_s2mm_aclk";
		xlnx,addrwidth = <0x20>;
		dma-channels = <2>;

		/* channel 0: memory -> stream ("tx") */
		dma-channel@a0010000 {
			compatible = "xlnx,axi-dma-mm2s-channel";
			interrupts = <0 90 4>;
			xlnx,datawidth = <0x40>;
		};

		/* channel 1: stream -> memory ("rx") */
		dma-channel@a0010030 {
			compatible = "xlnx,axi-dma-s2mm-channel";
			interrupts = <0 91 4>;
			xlnx,datawidth = <0x40>;
		};
	};
};
创建用户空间应用
Python接口
创建文件 conv_engine_python.py:
#!/usr/bin/env python3
"""
Python interface for the convolution engine.
"""
import array
import fcntl
import mmap
import os
import struct
import time
from ctypes import *

import numpy as np


class ConvEngine:
    """User-space wrapper around the /dev/conv_engine character device.

    Configuration goes through ioctl commands 0x1001-0x1003; the average
    cycle count is read back with ioctl 0x1004. Input and output tensors are
    exchanged with write()/read() on the device file.

    NOTE(review): this relies on the device implementing read()/write();
    confirm the kernel driver provides those handlers.
    """

    def __init__(self):
        # Open the device file
        self.dev_file = "/dev/conv_engine"
        self.fd = os.open(self.dev_file, os.O_RDWR)

        # Register offsets
        self.CTRL_REG = 0x00
        self.STATUS_REG = 0x04
        self.IMG_SIZE_REG = 0x08
        self.KERNEL_REG = 0x0C
        self.CHANNEL_REG = 0x10
        self.STRIDE_PAD_REG = 0x14
        self.PERF_CNT_REG = 0x18

        # Default configuration
        self.img_height = 640
        self.img_width = 640
        self.kernel_size = 3
        self.in_channels = 3
        self.out_channels = 16
        self.stride = 1
        self.padding = 1

    def configure(self, img_height=640, img_width=640, kernel_size=3,
                  in_channels=3, out_channels=16, stride=1, padding=1):
        """Store the convolution parameters and program the device.

        Image size, kernel size and channel counts are written through ioctl
        0x1001/0x1002/0x1003. stride and padding are only stored locally (they
        are used for the output-shape calculation): no ioctl command exists
        for the stride/padding register, so the previously computed but unused
        `stride_pad` word has been dropped.
        """
        self.img_height = img_height
        self.img_width = img_width
        self.kernel_size = kernel_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride
        self.padding = padding

        # Pack the register words
        img_size = (img_height << 16) | img_width
        kernel_cfg = (kernel_size << 8) | kernel_size
        channel_cfg = (in_channels << 16) | out_channels

        self._ioctl(0x1001, img_size)
        self._ioctl(0x1002, kernel_cfg)
        self._ioctl(0x1003, channel_cfg)

    def load_weights(self, weights):
        """Quantize `weights` to INT8 (scale 127, clipped) and send them."""
        weights_int8 = np.clip(weights * 127, -128, 127).astype(np.int8)
        self._write_all(weights_int8.tobytes())

    def run_inference(self, input_image):
        """Run one inference.

        Args:
            input_image: array of shape (img_height, img_width, in_channels).

        Returns:
            (output, time_ms): the de-quantized float32 output of shape
            (out_h, out_w, out_channels) and the wall-clock time in ms.

        Raises:
            ValueError: if the input shape does not match the configuration.
            OSError: if the device returns fewer bytes than expected.
        """
        expected_shape = (self.img_height, self.img_width, self.in_channels)
        if input_image.shape != expected_shape:
            raise ValueError(f"输入图像尺寸不匹配,期望{(self.img_height, self.img_width, self.in_channels)}")

        # Quantize to INT8
        input_int8 = np.clip(input_image * 127, -128, 127).astype(np.int8)

        start_time = time.time()

        # Send the input, then wait for and read the output
        self._write_all(input_int8.tobytes())
        output_bytes = self._read_exact(self._calculate_output_size())

        end_time = time.time()

        # Decode the INT32 output and de-quantize
        output_array = np.frombuffer(output_bytes, dtype=np.int32)
        output_array = output_array.reshape(self._calculate_output_shape())
        output_float = output_array.astype(np.float32) / 127.0

        inference_time = (end_time - start_time) * 1000  # ms
        return output_float, inference_time

    def get_performance_stats(self):
        """Return average cycles, average time (ms) and throughput (FPS).

        ioctl 0x1004 writes a 32-bit value through a user pointer, so a
        writable buffer is passed. Passing 0 (as before) hands the driver a
        NULL pointer, and fcntl.ioctl() would return the call status rather
        than the counter.
        """
        avg_cycles = self._ioctl_read_u32(0x1004)

        # Assumes a 200 MHz engine clock
        clock_freq = 200e6
        avg_time_ms = (avg_cycles / clock_freq) * 1000

        return {
            'average_cycles': avg_cycles,
            'average_time_ms': avg_time_ms,
            'throughput_fps': 1000.0 / avg_time_ms if avg_time_ms > 0 else 0
        }

    def _calculate_output_size(self):
        """Output size in bytes (INT32 per element)."""
        out_h, out_w, out_c = self._calculate_output_shape()
        return out_h * out_w * out_c * 4  # INT32

    def _calculate_output_shape(self):
        """Output shape (out_h, out_w, out_channels) of the convolution."""
        out_h = (self.img_height + 2*self.padding - self.kernel_size) // self.stride + 1
        out_w = (self.img_width + 2*self.padding - self.kernel_size) // self.stride + 1
        return (out_h, out_w, self.out_channels)

    def _ioctl(self, cmd, arg):
        """Issue an ioctl with an integer argument and return its status."""
        return fcntl.ioctl(self.fd, cmd, arg)

    def _ioctl_read_u32(self, cmd):
        """Issue an ioctl that stores one unsigned 32-bit value and return it."""
        buf = array.array('I', [0])
        fcntl.ioctl(self.fd, cmd, buf, True)
        return buf[0]

    def _write_all(self, data):
        """Write all of `data`, retrying after short writes."""
        view = memoryview(data)
        while len(view) > 0:
            written = os.write(self.fd, view)
            if written <= 0:
                raise OSError("short write to " + self.dev_file)
            view = view[written:]

    def _read_exact(self, size):
        """Read exactly `size` bytes, retrying after short reads."""
        chunks = []
        remaining = size
        while remaining > 0:
            chunk = os.read(self.fd, remaining)
            if not chunk:
                raise OSError("device returned %d of %d bytes"
                              % (size - remaining, size))
            chunks.append(chunk)
            remaining -= len(chunk)
        return b"".join(chunks)

    def benchmark(self, num_iterations=100):
        """Run `num_iterations` inferences on random input and print timing.

        Returns the per-iteration latencies in ms as a numpy array.
        """
        print("开始性能基准测试...")

        # Random test input
        test_input = np.random.randn(self.img_height, self.img_width, self.in_channels)

        times = []
        for i in range(num_iterations):
            _, time_ms = self.run_inference(test_input)
            times.append(time_ms)

            if (i+1) % 10 == 0:
                print(f" 完成 {i+1}/{num_iterations} 次推理")

        times = np.array(times)
        print("\n基准测试结果:")
        print(f" 平均延迟: {np.mean(times):.2f} ms")
        print(f" 最小延迟: {np.min(times):.2f} ms")
        print(f" 最大延迟: {np.max(times):.2f} ms")
        print(f" 标准差: {np.std(times):.2f} ms")
        print(f" 吞吐量: {1000.0/np.mean(times):.2f} FPS")

        return times

    def close(self):
        """Close the device file; safe to call more than once."""
        fd = getattr(self, 'fd', None)
        if fd is not None:
            self.fd = None
            os.close(fd)

    def __del__(self):
        """Release the device file descriptor."""
        self.close()


# Usage example
def main():
    """Demo: configure a 32x32x3 convolution, run it once, then benchmark."""
    # Create the engine instance
    engine = ConvEngine()

    # Configure the layer parameters
    layer_config = dict(
        img_height=32,
        img_width=32,
        kernel_size=3,
        in_channels=3,
        out_channels=16,
    )
    engine.configure(**layer_config)

    # Load random test weights, laid out as [out_ch, in_ch, k_h, k_w]
    engine.load_weights(np.random.randn(16, 3, 3, 3))

    # Run a single inference
    sample_image = np.random.randn(32, 32, 3)
    output, time_ms = engine.run_inference(sample_image)

    print(f"推理完成!")
    print(f" 输出形状: {output.shape}")
    print(f" 推理时间: {time_ms:.2f} ms")

    # Fetch the performance statistics
    stats = engine.get_performance_stats()
    print(f"\n性能统计:")
    print(f" 平均周期数: {stats['average_cycles']}")
    print(f" 平均时间: {stats['average_time_ms']:.2f} ms")
    print(f" 吞吐量: {stats['throughput_fps']:.2f} FPS")

    # Run the benchmark
    engine.benchmark(100)


if __name__ == "__main__":
    main()