郫县哪里有做网站的织梦学校网站源码
郫县哪里有做网站的,织梦学校网站源码,西安风险等级最新,深圳北站设计方案RDMA通过kernel-bypass和协议栈offload两大核心技术，实现了远高于传统TCP/IP的网络通信性能。尽管RDMA的性能要远好于TCP/IP，但目前RDMA的实际落地业务场景却寥寥无几，这其中制约RDMA技术大规模上线应用的主要原因有两点：
主流互…RDMA通过kernel-bypass和协议栈offload两大核心技术，实现了远高于传统TCP/IP的网络通信性能。尽管RDMA的性能要远好于TCP/IP，但目前RDMA的实际落地业务场景却寥寥无几，这其中制约RDMA技术大规模上线应用的主要原因有两点：
1. 主流互联网公司普遍选择RoCE（RDMA over Converged Ethernet）作为RDMA部署方案，而RoCE本质上是RDMA over UDP，在网络上无法保证不丢包。因此RoCE部署方案需要额外的拥塞控制机制来保证底层的无损网络，如PFC、ECN等，这给大规模的上线部署带来挑战。而且目前各大厂商对硬件拥塞控制的支持均还不完善，存在兼容性问题。
2. RDMA提供了完全不同于socket的编程接口，因此要想使用RDMA，需要对现有应用进行改造。而RDMA原生编程API（verbs/RDMA_CM）比较复杂，需要对RDMA技术有深入理解才能做好开发，学习成本较高。
为了降低应用程序的改造成本，决定研发一个RDMA通信库，该通信库直接基于ibverbs和RDMA_CM，避免对其他第三方库的调用。本文主要对rdma编程的事件通知机制进行归纳总结。
传统socket编程中通常采用IO复用技术（select、poll、epoll等）来实现事件通知机制，那么对于rdma，是否可以同样基于IO复用技术来实现事件通知机制？答案是完全可以。
1. RDMA_CM API（For Connection）
在rdma编程时可以直接通过RDMA_CM API来建立RDMA连接。
对rdma_create_id函数进行分析其主要创建了rdma_cm_id对象并将其注册到驱动中。
int rdma_create_id(struct rdma_event_channel *channel,
struct rdma_cm_id **id, void *context,
enum rdma_port_space ps)
{
enum ibv_qp_type qp_type (ps RDMA_PS_IPOIB || ps RDMA_PS_UDP) ?
IBV_QPT_UD : IBV_QPT_RC;
ret ucma_init(); //查询获取所有IB设备存放在cma_dev_array全局数组中检测是否支持AF_IB协议 struct cma_id_private *id_priv
ucma_alloc_id(channel, context, ps, qp_type); //创建并初始化id_priv对象若未创建rdma_event_channel那么调用rdma_create_event_channel创建一个。 CMA_INIT_CMD_RESP(cmd, sizeof cmd, CREATE_ID, resp, sizeof resp);
cmd.uid (uintptr_t) id_priv;
cmd.ps ps;
cmd.qp_type qp_type; ret write(id_priv-id.channel-fd, cmd, sizeof cmd); //将id_priv相关信息注册到内核驱动中不做过多分析
*id id_priv-id; //返回rdma_cm_id对象
}
rdma_cm_id数据结构定义如下
/*
 * Communication identifier used by the RDMA_CM API. It bundles the
 * connection-setup event channel with the verbs objects (QP, CQs,
 * completion channels, PD) used for data transfer.
 */
struct rdma_cm_id {
struct ibv_context *verbs; // device context; from ibv_open_device (per the original note)
struct rdma_event_channel *channel; // created by rdma_create_event_channel; used for connection setup
void *context; // user specified context
struct ibv_qp *qp; // created by rdma_create_qp, which calls ibv_create_qp underneath
struct rdma_route route;
enum rdma_port_space ps; // RDMA_PS_IPOIB or RDMA_PS_UDP or RDMA_PS_TCP
uint8_t port_num; // NOTE(review): original note reads "number of ports"; the field name suggests a port number - confirm
struct rdma_cm_event *event; // rdma_cm event(s) related to this id
struct ibv_comp_channel *send_cq_channel; // created by ibv_create_comp_channel; used for data transfer
struct ibv_cq *send_cq; // send CQ; usually the same CQ as recv_cq
struct ibv_comp_channel *recv_cq_channel; // created by ibv_create_comp_channel; used for data transfer
struct ibv_cq *recv_cq; // receive CQ; usually the same CQ as send_cq
struct ibv_srq *srq;
struct ibv_pd *pd; // NOTE(review): original note says ibv_open_device; a PD is normally obtained via ibv_alloc_pd - confirm
enum ibv_qp_type qp_type; // IBV_QPT_RC or IBV_QPT_UD
};
在创建rdma_cm_id时如果预先没有创建rdma_event_channel那么需要调用rdma_create_event_channel函数。
struct rdma_event_channel *rdma_create_event_channel(void)
{
struct rdma_event_channel *channel; if (ucma_init()) //通过static局部变量保证只做一次初始化
return NULL; channel malloc(sizeof *channel); //创建rdma_event_channel
if (!channel)
return NULL; channel-fd open(/dev/infiniband/rdma_cm, O_RDWR | O_CLOEXEC); //可以看出rdma_event_channel本质上就是一个fd
if (channel-fd 0) {
goto err;
}
return channel;
err:
free(channel);
return NULL;
}
rdma_event_channel的定义如下：
/* Event channel: a wrapper around the rdma_cm device file descriptor. */
struct rdma_event_channel {
    int fd;
};
1.1 RDMA_CM原生事件通知实现(in block way)
static int cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event); ret rdma_get_cm_event(channel, event); //阻塞操作直到有rdma_cm event发生才返回
if (!ret) {
ret cma_handler(event-id, event); //处理事件
rdma_ack_cm_event(event); //ack event
} static int cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) {
int ret 0;
switch (event-event)
{
case RDMA_CM_EVENT_ADDR_RESOLVED:
ret addr_handler(cma_id-context);
break;
case RDMA_CM_EVENT_MULTICAST_JOIN:
ret join_handler(cma_id-context, event-param.ud);
break;
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR:
case RDMA_CM_EVENT_MULTICAST_ERROR:
printf(mckey: event: %s, error: %d\n, rdma_event_str(event-event), event-status); connect_error();
ret event-status;
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
/* Cleanup will occur after test completes. */
break;
default:
break;
}
可以看出，RDMA_CM的fd所侦测的都是建立连接相关的event，其不涉及数据传输相关的event，所以rdma_cm event只用于通知建连相关事件：
/*
 * Event types reported on the rdma_cm event channel and retrieved with
 * rdma_get_cm_event(). All of them concern address/route resolution,
 * connection management, multicast membership or device state; none
 * describes completion of a data transfer.
 */
enum rdma_cm_event_type {
RDMA_CM_EVENT_ADDR_RESOLVED,
RDMA_CM_EVENT_ADDR_ERROR,
RDMA_CM_EVENT_ROUTE_RESOLVED,
RDMA_CM_EVENT_ROUTE_ERROR,
RDMA_CM_EVENT_CONNECT_REQUEST,
RDMA_CM_EVENT_CONNECT_RESPONSE,
RDMA_CM_EVENT_CONNECT_ERROR,
RDMA_CM_EVENT_UNREACHABLE,
RDMA_CM_EVENT_REJECTED,
RDMA_CM_EVENT_ESTABLISHED,
RDMA_CM_EVENT_DISCONNECTED,
RDMA_CM_EVENT_DEVICE_REMOVAL,
RDMA_CM_EVENT_MULTICAST_JOIN,
RDMA_CM_EVENT_MULTICAST_ERROR,
RDMA_CM_EVENT_ADDR_CHANGE,
RDMA_CM_EVENT_TIMEWAIT_EXIT
};
1.2 IO复用poll/epoll（in non-block way）
rdma_cm fd不同于传统socket fd，其只会向上抛POLLIN事件，表示有rdma_cm event事件发生，具体event类型需要通过rdma_get_cm_event来获取。
/* change the blocking mode of the completion channel */
flags fcntl(cm_id-channel-fd, F_GETFL);
rc fcntl(cm_id-channel-fd, F_SETFL, flags | O_NONBLOCK); //设置rdma_cm fd为NONBLOCK
if (rc 0) {
fprintf(stderr, Failed to change file descriptor of Completion Event Channel\n);
return -1;
} struct pollfd my_pollfd;
int ms_timeout 10; /*
* poll the channel until it has an event and sleep ms_timeout
* milliseconds between any iteration
*/
my_pollfd.fd cm_id-channel-fd;
my_pollfd.events POLLIN; //只需要监听POLLIN事件POLLIN事件意味着有rdma_cm event发生
my_pollfd.revents 0;
do {
rc poll(my_pollfd, 1, ms_timeout); //非阻塞操作有事件或者超时时返回
} while (rc 0); /* 注意poll监听到有事件发生只意味着有rdma_cm event事件发生但具体event仍然需要通过rdma_get_cm_event来获取。*/
ret rdma_get_cm_event(channel, event);
if (!ret) {
ret cma_handler(event-id, event); //处理收到的事件
rdma_ack_cm_event(event); //ack event
} 2. verbs APIFor data transfer
从上一节可以看出，RDMA_CM中的fd只涉及建连相关的事件，其无法获取数据传输相关的事件。对于RDMA传输，数据传输是由NIC硬件完成的，完全不需要CPU参与。网卡硬件完成数据传输后，会向CQ（completion queue）中提交一个cqe，用于描述数据传输完成情况。
struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe,
void *cq_context, struct ibv_comp_channel *channel, int comp_vector) # 作用创建CQ每个QP都有对应的send cq和recv cq。
# 一个CQ可以被同一个QP的send queue和recv queue共享也可以被多个不同的QP共享 # 注意CQ仅仅只是一个queue其本身没有built-in的事件通知机制。如果想要增加事件通知机制那么需要指定channel对象。
verbs API提供了创建ibv_comp_channel的编程接口:
struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context)
# 作用创建completion channel用于向user通知有新的completion queue eventcqe已经被写入CQ中。
struct ibv_comp_channel {
struct ibv_context *context;
int fd;
int refcnt;
};
2.1 Verbs原生事件通知实现（in block way）
struct ibv_context *context;
struct ibv_cq *cq;
void *ev_ctx NULL; /* can be initialized with other values for the CQ context */ /* Create a CQ, which is associated with a Completion Event Channel */
cq ibv_create_cq(ctx, 1, ev_ctx, channel, 0);
if (!cq) {
fprintf(stderr, Failed to create CQ\n);
return -1;
}
/* Request notification before any completion can be created (to prevent races) */
ret ibv_req_notify_cq(cq, 0);
if (ret) {
fprintf(stderr, Couldnt request CQ notification\n);
return -1;
}
/* The following code will be called each time you need to read a Work Completion */
struct ibv_cq *ev_cq;
void *ev_ctx;
int ret;
int ne;
/* Wait for the Completion event */
ret ibv_get_cq_event(channel, ev_cq, ev_ctx); //阻塞函数直到有cqe发生才返回ev_cq指向发生cqe的CQ
if (ret) {
fprintf(stderr, Failed to get CQ event\n);
return -1;
}
/* Ack the event */
ibv_ack_cq_events(ev_cq, 1);
/* Request notification upon the next completion event */
ret ibv_req_notify_cq(ev_cq, 0);
if (ret) {
fprintf(stderr, Couldnt request CQ notification\n);
return -1;
}
/* Empty the CQ: poll all of the completions from the CQ (if any exist) */
do {
ne ibv_poll_cq(cq, 1, wc);
if (ne 0) {
fprintf(stderr, Failed to poll completions from the CQ: ret %d\n,
ne);
return -1;
}
/* there may be an extra event with no completion in the CQ */
if (ne 0)
continue;
if (wc.status ! IBV_WC_SUCCESS) {
fprintf(stderr, Completion with status 0x%x was found\n,
wc.status);
return -1;
}
} while (ne); 2.2 IO复用poll/epollin non-block way
利用fcntl设置channel->fd的属性为non-block，然后就可以用poll/epoll/select等来监听channel->fd的POLLIN事件，POLLIN事件意味着有新的completion queue event被填入CQ中。user程序在被唤醒后，无需像传统socket那样进行read/write操作（因为data已经直接DMA到用户态缓存中），而是需要做poll_cq操作，对每一个cqe进行解析处理。
struct ibv_context *context;
struct ibv_cq *cq;
void *ev_ctx NULL; /* can be initialized with other values for the CQ context */ /* Create a CQ, which is associated with a Completion Event Channel */
cq ibv_create_cq(ctx, 1, ev_ctx, channel, 0);
if (!cq) {
fprintf(stderr, Failed to create CQ\n);
return -1;
} /* Request notification before any completion can be created (to prevent races) */
ret ibv_req_notify_cq(cq, 0);
if (ret) {
fprintf(stderr, Couldnt request CQ notification\n);
return -1;
} /* The following code will be called only once, after the Completion Event Channel
was createdto change the blocking mode of the completion channel */
int flags fcntl(channel-fd, F_GETFL);
rc fcntl(channel-fd, F_SETFL, flags | O_NONBLOCK);
if (rc 0) {
fprintf(stderr, Failed to change file descriptor of Completion Event Channel\n);
return -1;
} /* The following code will be called each time you need to read a Work Completion */
struct pollfd my_pollfd;
struct ibv_cq *ev_cq;
void *ev_ctx;
int ne;
int ms_timeout 10; /*
* poll the channel until it has an event and sleep ms_timeout
* milliseconds between any iteration
*/
my_pollfd.fd channel-fd;
my_pollfd.events POLLIN; //只需要监听POLLIN事件POLLIN事件意味着有新的cqe发生
my_pollfd.revents 0;
do {
rc poll(my_pollfd, 1, ms_timeout); //非阻塞函数有cqe事件或超时时退出
} while (rc 0);
if (rc 0) {
fprintf(stderr, poll failed\n);
return -1;
}
ev_cq cq; /* Wait for the completion event */
ret ibv_get_cq_event(channel, ev_cq, ev_ctx); //获取completion queue event。对于epoll水平触发模式必须要执行ibv_get_cq_event并将该cqe取出否则会不断重复唤醒epoll
if (ret) {
fprintf(stderr, Failed to get cq_event\n);
return -1;
}
/* Ack the event */
ibv_ack_cq_events(ev_cq, 1); //ack cqe /* Request notification upon the next completion event */
ret ibv_req_notify_cq(ev_cq, 0);
if (ret) {
fprintf(stderr, Couldnt request CQ notification\n);
return -1;
} /* Empty the CQ: poll all of the completions from the CQ (if any exist) */
do {
ne ibv_poll_cq(cq, 1, wc);
if (ne 0) {
fprintf(stderr, Failed to poll completions from the CQ: ret %d\n,
ne);
return -1;
}
/* there may be an extra event with no completion in the CQ */
if (ne 0)
continue; if (wc.status ! IBV_WC_SUCCESS) {
fprintf(stderr, Completion with status 0x%x was found\n,
wc.status);
return -1;
}
} while (ne); 3. rpoll实现rsocket
rsocket是附在rdma_cm库中的一个子模块，提供了完全类似于socket接口的rdma调用。此处主要对rpoll的实现进行分析。
rpoll同时支持对rdma fd和正常socket fd进行监听，但对于rdma fd，其目前仅支持四种事件：POLLIN、POLLOUT、POLLHUP、POLLERR。
* Note that we may receive events on an rsocket that may not be reported
* to the user (e.g. connection events or credit updates). Process those
* events, then return to polling until we find ones of interest.
*/
int rpoll(struct pollfd *fds, nfds_t nfds, int timeout)
{
struct timeval s, e;
struct pollfd *rfds;
uint32_t poll_time 0;
int ret; do {
ret rs_poll_check(fds, nfds); //主动轮询查看是否有event发生
if (ret || !timeout) //如果有event发生或者timeout为0直接返回
return ret; if (!poll_time)
gettimeofday(s, NULL); gettimeofday(e, NULL);
poll_time (e.tv_sec - s.tv_sec) * 1000000
(e.tv_usec - s.tv_usec) 1;
} while (poll_time polling_time); //尝试轮询polling_time时间该时间内如果有event发生那么直接返回否则进入后续逻辑 rfds rs_fds_alloc(nfds); //创建新的pollfd数组rfds用于添加到原生poll中。
if (!rfds)
return ERR(ENOMEM); do {
ret rs_poll_arm(rfds, fds, nfds); //对所有verbs fd进行arm操作并将待监听事件全部改为POLLIN
if (ret)
break; ret poll(rfds, nfds, timeout); //调用OS原生poll
if (ret 0)
break; ret rs_poll_events(rfds, fds, nfds); //将cqe或rdma_cm event转化为具体event
} while (!ret); rpoll中调用rs_poll_check进行轮询查看是否有event发生。
static int rs_poll_check(struct pollfd *fds, nfds_t nfds)
{
struct rsocket *rs;
int i, cnt 0; for (i 0; i nfds; i) {
rs idm_lookup(idm, fds[i].fd); //根据fd找到对应的rsocket对象
if (rs)
fds[i].revents rs_poll_rs(rs, fds[i].events, 1, rs_poll_all);
//查看rsocket fd是否有event发生手动向上抛事件
else
poll(fds[i], 1, 0); //普通fd非阻塞poll一次查询是否有event发生 if (fds[i].revents)
cnt;
}
return cnt;
} static int rs_poll_rs(struct rsocket *rs, int events,
int nonblock, int (*test)(struct rsocket *rs))
{
struct pollfd fds;
short revents;
int ret; check_cq:
if ((rs-type SOCK_STREAM) ((rs-state rs_connected) ||
(rs-state rs_disconnected) || (rs-state rs_error))) {
rs_process_cq(rs, nonblock, test); //调用ibv_poll_cq遍历cqe
//对于send cqe可以在处理函数中将发送缓存重新放回到内存池中
//对于recv cqe可以在处理函数中更新可读数据length和addr等 revents 0;
if ((events POLLIN) rs_conn_have_rdata(rs)) //接收缓存有数据抛POLLIN
事件
revents | POLLIN;
if ((events POLLOUT) rs_can_send(rs)) //发送缓存可写抛POLLOUT事件
revents | POLLOUT;
if (!(rs-state rs_connected)) {
if (rs-state rs_disconnected)
revents | POLLHUP; //断开连接抛POLLHUP事件
else
revents | POLLERR; //抛POLLERR事件
} return revents;
} else if (rs-type SOCK_DGRAM) { //UDP相关逻辑不关注
ds_process_cqs(rs, nonblock, test); revents 0;
if ((events POLLIN) rs_have_rdata(rs))
revents | POLLIN;
if ((events POLLOUT) ds_can_send(rs))
revents | POLLOUT; return revents;
} if (rs-state rs_listening) { //rmda_cm fd
fds.fd rs-cm_id-channel-fd;
fds.events events; //此处没有将要监听的事件设置为POLLINwhy
fds.revents 0;
poll(fds, 1, 0); //直接poll一次然后返回
return fds.revents;
} if (rs-state rs_opening) {
ret rs_do_connect(rs);
if (ret (errno EINPROGRESS)) {
errno 0;
} else {
goto check_cq;
}
} if (rs-state rs_connect_error) {
revents 0;
if (events POLLOUT)
revents | POLLOUT;
if (events POLLIN)
revents | POLLIN;
revents | POLLERR;
return revents;
} return 0;
} 当主动轮询polling_time时间后如果仍然没有event发生且尚未超时那么就需要调用rs_poll_arm函数其主要作用有两点1对所有verbs fd进行arm操作(ibv_notify_cq_event)2将所有rdma相关事件全部修改为监听POLLIN事件然后丢给原生poll函数去监听。
static int rs_poll_arm(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds)
{
struct rsocket *rs;
int i; for (i 0; i nfds; i) {
rs idm_lookup(idm, fds[i].fd);
if (rs) { // rdma相关fd
fds[i].revents rs_poll_rs(rs, fds[i].events, 0, rs_is_cq_armed);
if (fds[i].revents)
return 1; if (rs-type SOCK_STREAM) {
if (rs-state rs_connected)
rfds[i].fd rs-cm_id-recv_cq_channel-fd; //verbs fd用于通知data传输event
else
rfds[i].fd rs-cm_id-channel-fd; //rdma_cm fd用于通知connect event
} else {
rfds[i].fd rs-epfd;
}
rfds[i].events POLLIN; //所有监听事件全部改为POLLIN
} else { //普通fd
rfds[i].fd fds[i].fd;
rfds[i].events fds[i].events;
}
rfds[i].revents 0;
}
return 0;
}
原生poll在超时时间内如果监听到有事件发生那么调用rs_poll_events函数。
static int rs_poll_events(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds)
{
struct rsocket *rs;
int i, cnt 0; for (i 0; i nfds; i) {
if (!rfds[i].revents) //没有事件发生跳过
continue; rs idm_lookup(idm, fds[i].fd);
if (rs) {
fastlock_acquire(rs-cq_wait_lock);
if (rs-type SOCK_STREAM)
rs_get_cq_event(rs); //调用ibv_get_cq_event
else
ds_get_cq_event(rs);
fastlock_release(rs-cq_wait_lock);
fds[i].revents rs_poll_rs(rs, fds[i].events, 1, rs_poll_all); //手动向上抛事件
} else {
fds[i].revents rfds[i].revents; //普通fd直接向上抛事件
}
if (fds[i].revents)
cnt;
}
return cnt;
}
总结来看，对于rpoll实现，主要分两个步骤：
1. 主动遍历轮询polling_time时间，查看是否有event发生；2. 如果polling_time时间内没有event发生，那么将verbs/rdma_cm fd直接注册到OS原生poll中，并将待监听事件改为POLLIN，然后调用原生poll。如果poll监听到verbs/rdma_cm fd的事件，这只意味着有cqe事件或rdma_cm事件发生，不能直接返回给用户，需要额外进行逻辑判断，以确定究竟是否要向上抛事件，以及抛什么事件。
4. 总结
对于rdma编程，目前主流实现是利用rdma_cm来建立连接，然后利用verbs来传输数据。
rdma_cm和ibverbs分别会创建一个fd，这两个fd的分工不同。rdma_cm fd主要用于通知建连相关的事件，verbs fd则主要通知有新的cqe发生。当直接对rdma_cm fd进行poll/epoll监听时，此时只能监听到POLLIN事件，这意味着有rdma_cm事件发生。当直接对verbs fd进行poll/epoll监听时，同样只能监听到POLLIN事件，这意味着有新的cqe。
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/bicheng/88973.shtml
如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!