SPDK 的 iSCSI initiator 模块(bdev_iscsi)可与远端的 iscsi_tgt 配合, 把 SPDK 块设备(bdev)的后端存储拉远到 TGT 端。bdev 的后端除了可以对接 iSCSI, 还可以对接 Ceph RBD、NVMe-oF target 等
下面以 vhost client 读取远端 SPDK bdev 为例
本文涉及的 IO 调用栈如下:
APP WRITE/READ
BLK
VIRTIO_BLK
-------------
REMOTE SPDK BDEV
libiscsi.so
-------------
REMOTE iscsi_tgt
vhost_user_process_blk_request
struct virtio_blk_outhdr req
/* VIRTIO_BLK_T* */
__virtio32 type;
/* io priority. */
__virtio32 ioprio;
/* Sector (ie. 512 byte offset) */
__virtio64 sector;
task->cb = cb
iov = &task->iovs[0]
memcpy(&req, iov->iov_base, sizeof(req))
switch (type)
case VIRTIO_BLK_T_IN:
spdk_bdev_readv(bvdev->bdev_desc, ch,&task->iovs[1], iovcnt, req.sector * 512,payload_len,blk_request_complete_cb, task)
case VIRTIO_BLK_T_OUT
spdk_bdev_writev(bvdev->bdev_desc, ch, &task->iovs[1], iovcnt, req.sector * 512, payload_len, blk_request_complete_cb, task)
bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc)
spdk_bdev_writev_blocks
case VIRTIO_BLK_T_DISCARD
case VIRTIO_BLK_T_WRITE_ZEROES
spdk_bdev_readv 调用栈
spdk_bdev_readv
blk_request_complete_cb
blk_request_finish
vhost_user_blk_request_finish
blk_task_enqueue
vhost_vq_packed_ring_enqueue
blk_task_finish
spdk_bdev_readv
spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg)
bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, num_blocks, NULL, NULL, NULL, cb, cb_arg)
bdev_io = bdev_channel_get_io(channel)
bdev_io->u.bdev.iovs = iov
...
bdev->fn_table->submit_request(ioch, bdev_io)
static const struct spdk_bdev_fn_table iscsi_fn_table = {
.destruct = bdev_iscsi_destruct,
.submit_request = bdev_iscsi_submit_request,
_bdev_iscsi_submit_request
switch (bdev_io->type)
case SPDK_BDEV_IO_TYPE_READ:
spdk_bdev_io_get_buf(bdev_io, bdev_iscsi_get_buf_cb, bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen)
alignment = spdk_bdev_get_buf_align(bdev)
_are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment))
for iovcnt
if ((iov_base & (alignment - 1)) != 0)
cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true)
or bdev_io_get_buf(bdev_io, len)
max_len = bdev_io_get_max_buf_len(bdev_io, len)
buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, bdev_io_get_iobuf_cb)
buf = (void *)STAILQ_FIRST(&pool->cache)
sz = spdk_ring_dequeue(pool->pool, (void **)bufs, spdk_min(IOBUF_BATCH_SIZE, spdk_max(pool->cache_size, 1)))
rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count, NULL)
_bdev_io_set_buf(bdev_io, buf, len)
_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl)
bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs
bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov
bdev_io_pull_data(bdev_io)
bdev_iscsi_get_buf_cb
bdev_iscsi_readv
task = iscsi_read16_task(lun->context, lun->lun_id, lba, nbytes, lun->bdev.blocklen, 0, 0, 0, 0, 0, bdev_iscsi_command_cb, iscsi_io)
iscsi_queue_pdu(iscsi, pdu)
scsi_task_add_data_in_buffer(task, iov[i].iov_len, iov[i].iov_base)
./scripts/rpc.py bdev_iscsi_create -b iSCSI001 -i iqn.2016-06.io.spdk:disk1 --url iscsi://x.x.x.x:3261/iqn.2016-06.io.spdk:disk1/0
URL格式和示例
url: iscsi://[<username>[%<password>]@]<host>[:<port>]/<target-iqn>/<lun>
Example: iscsi://ronnie%password@server/iqn.ronnie.test/1
调用栈:
rpc.py bdev_iscsi_create -b iSCSI0 -i iqn.2016-06.io.spdk:init --url iscsi://127.0.0.1/iqn.2016-06.io.spdk:disk1/0
SPDK_RPC_REGISTER("bdev_iscsi_create", rpc_bdev_iscsi_create, SPDK_RPC_RUNTIME)
create_iscsi_disk(req.name, req.url, req.initiator_iqn, bdev_iscsi_create_cb, request)
req = calloc(1, sizeof(struct bdev_iscsi_conn_req))
req->context = iscsi_create_context(initiator_iqn)
iscsi_init_transport(iscsi, TCP_TRANSPORT)
iscsi_init_tcp_transport
iscsi->drv = &iscsi_transport_tcp
iscsi->current_phase = ISCSI_PDU_LOGIN_CSG_SECNEG
iscsi->next_phase = ISCSI_PDU_LOGIN_NSG_OPNEG
iscsi->max_burst_length = 262144 -> 256KB
iscsi_set_log_level(iscsi, 10);
iscsi->rdma_ack_timeout = atoi(getenv("LIBISCSI_RDMA_ACK_TIMEOUT"))
req->create_cb = cb_fn
iscsi_parse_full_url
iscsi_set_session_type(req->context, ISCSI_SESSION_NORMAL) -> 设置 iSCSI 会话类型:发现(discovery)会话用于查询所连门户(portal)后面有哪些 target;普通(normal)会话用于登录并对 SCSI LUN 执行 I/O
iscsi_full_connect_async(req->context, iscsi_url->portal, iscsi_url->lun, iscsi_connect_cb, req) -> 异步连接 LUN:该函数会连接到门户、登录并验证 LUN 是否可用
iscsi_connect_async(iscsi, portal, iscsi_connect_cb, ct)
iscsi_connect_cb
iscsi_login_async(iscsi, iscsi_login_cb, ct)
pdu = iscsi_allocate_pdu(iscsi,
pdu = iscsi->drv->new_pdu(iscsi, sizeof(struct iscsi_pdu)) -> iscsi_tcp_new_pdu
pdu->callback = cb
pdu->private_data = private_data
iscsi_queue_pdu(iscsi, pdu)
iscsi_connect_cb
iscsi_inquiry_task(iscsi, req->lun, 1,SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING,255, bdev_iscsi_inquiry_lbp_cb, req)
bdev_iscsi_inquiry_lbp_cb
task = iscsi_inquiry_task(context, req->lun, 1,SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS,255, bdev_iscsi_inquiry_bl_cb, req)
bdev_iscsi_inquiry_bl_cb
scsi_datain_unmarshall
iscsi_readcapacity16_task(context, req->lun, iscsi_readcapacity16_cb, req)
iscsi_readcapacity16_cb
readcap16 = scsi_datain_unmarshall(task)
status = create_iscsi_lun(req, readcap16->returned_lba + 1, readcap16->block_length, &bdev, readcap16->lbppbe)
lun = calloc(sizeof(*lun), 1)
lun->bdev.module = &g_iscsi_bdev_module
lun->bdev.blocklen = block_size;
lun->bdev.blockcnt = num_blocks;
lun->bdev.fn_table = &iscsi_fn_table
spdk_io_device_register(lun, bdev_iscsi_create_cb, bdev_iscsi_destroy_cb, sizeof(struct bdev_iscsi_io_channel)
spdk_bdev_register(&lun->bdev)
lun->no_main_ch_poller = SPDK_POLLER_REGISTER(bdev_iscsi_no_main_ch_poll, lun,
bdev_iscsi_poll_lun
pfd.events = iscsi_which_events(lun->context)
if (poll(&pfd, 1, 0) < 0)
iscsi_service(lun->context, pfd.revents)
complete_conn_req -> req->create_cb -> bdev_iscsi_create_cb
iscsi_set_initiator_username_pwd
TAILQ_INSERT_TAIL(&g_iscsi_conn_req, req, link)
g_conn_poller = SPDK_POLLER_REGISTER(iscsi_bdev_conn_poll, NULL, BDEV_ISCSI_CONNECTION_POLL_US)
创建iscsi块设备重点
iscsi_bdev_conn_poll
TAILQ_FOREACH_SAFE(req, &g_iscsi_conn_req, link, tmp)
pfd.fd = iscsi_get_fd(context)
poll(&pfd, 1, 0)
iscsi_service(context, pfd.revents)
iscsi->drv->service(iscsi, revents) -> iscsi_tcp_service(struct iscsi_context *iscsi, int revents)
if (revents & POLLERR)
if (revents & POLLHUP)
if (iscsi->is_connected == 0 && revents&POLLOUT)
iscsi->is_connected = 1
iscsi->socket_status_cb(iscsi, SCSI_STATUS_GOOD, NULL, iscsi->connect_data) -> iscsi_connect_cb
iscsi_service_reconnect_if_loggedin
if (revents & POLLIN)
iscsi_read_from_socket(iscsi)
if (data_size != 0)
count = iscsi_iovector_readv_writev(iscsi, iovector_in,
if (do_write) {
n = writev(iscsi->fd, (struct iovec*) iov, niov);
} else {
n = readv(iscsi->fd, (struct iovec*) iov, niov);
}
count = recv
iscsi_process_pdu(iscsi, in)
if (revents & POLLOUT)
iscsi_write_to_socket
while (iscsi->outqueue != NULL || iscsi->outqueue_current != NULL)
iscsi->outqueue->outdata.data[0] & 0x3f) != ISCSI_PDU_DATA_OUT)
count = send(iscsi->fd, (void *)(pdu->outdata.data
iscsi_timeout_scan(iscsi) -> check_timeout
iscsi_pdu_data_out_inprocess(iscsi, pdu)
if ((pdu->outdata.data[0] & 0x3f) != ISCSI_PDU_SCSI_REQUEST)
iscsi_process_pdu(struct iscsi_context *iscsi, struct iscsi_in_pdu *in)
enum iscsi_opcode opcode = in->hdr[0] & 0x3f
crc = crc32c(in->hdr, ISCSI_RAW_HEADER_SIZE)
verify data checksum
iscsi_process_pdu_serials(iscsi, in)
for (pdu = iscsi->waitpdu; pdu; pdu = pdu->next)
switch (opcode)
case ISCSI_PDU_DATA_IN:
iscsi_process_scsi_data_in
pdu->callback(iscsi, status, task, pdu->private_data)
static iscsi_transport iscsi_transport_tcp = {
.connect = iscsi_tcp_connect,
iscsi->fd = socket(ai_family, SOCK_STREAM, 0)
iscsi_set_tcp_keepalive(iscsi, iscsi->tcp_keepidle, iscsi->tcp_keepcnt, iscsi->tcp_keepintvl)
setsockopt(iscsi->fd, SOL_SOCKET, SO_KEEPALIVE, (char *)&value, sizeof(value)
set_tcp_sockopt(iscsi->fd, TCP_KEEPCNT, count)
connect(iscsi->fd, &sa->sa, socksize)
.queue_pdu = iscsi_tcp_queue_pdu,
iscsi_add_to_outqueue(iscsi, pdu)
struct iscsi_pdu *current = iscsi->outqueue
if (pdu->outdata.data[0] & ISCSI_PDU_IMMEDIATE) -> 发送队列(outqueue)中的 PDU 按 CmdSN 升序排列,CmdSN 相同的 PDU 保持 FIFO 顺序。立即(immediate)PDU 排在队列最前面,其 CmdSN 取发送队列中第一个命令 PDU 的 CmdSN
iscsi_serial32_compare(pdu->cmdsn, current->cmdsn) -> 按 RFC1982 定义的规则比较两个 32 位序列号:相等返回 0;s1 大于 s2 返回 1;s1 小于 s2 返回 -1。根据 RFC1982 第 3.2 节,极少数情况下比较结果未定义(例如 s1 = 0 且 s2 = 2^31),但这种情况在 iSCSI 协议中不应出现
last->next = pdu;
.new_pdu = iscsi_tcp_new_pdu,
iscsi_szmalloc(iscsi, size)
void *ptr = iscsi_smalloc(iscsi, size)
ptr = iscsi->smalloc_ptrs[--iscsi->smalloc_free]
or ptr = iscsi_malloc(iscsi, iscsi->smalloc_size)
void * ptr = malloc(size);
if (ptr != NULL) iscsi->mallocs++;
.disconnect = iscsi_tcp_disconnect,
.free_pdu = iscsi_tcp_free_pdu,
.service = iscsi_tcp_service,
.get_fd = iscsi_tcp_get_fd,
.which_events = iscsi_tcp_which_events,
};
/*
 * Header digest modes supported by the initiator; NONE is the default.
 * ISCSI_HEADER_DIGEST_LAST aliases the highest-valued mode so that code can
 * bound-check against it.
 */
enum iscsi_header_digest {
	ISCSI_HEADER_DIGEST_NONE        = 0,
	ISCSI_HEADER_DIGEST_NONE_CRC32C = 1,
	ISCSI_HEADER_DIGEST_CRC32C_NONE = 2,
	ISCSI_HEADER_DIGEST_CRC32C      = 3,
	ISCSI_HEADER_DIGEST_LAST        = ISCSI_HEADER_DIGEST_CRC32C,
};
/*
* User-settable TCP option names (used with setsockopt).
* Each macro is the integer option identifier; the trailing comment on each
* line describes what the option controls.
*/
#define TCP_NODELAY 1 /* Don't delay send to coalesce packets */
#define TCP_MAXSEG 2 /* Set maximum segment size */
#define TCP_CORK 3 /* Control sending of partial frames */
#define TCP_KEEPIDLE 4 /* Start keepalives after this period */
#define TCP_KEEPINTVL 5 /* Interval between keepalives */
#define TCP_KEEPCNT 6 /* Number of keepalives before death */
#define TCP_SYNCNT 7 /* Number of SYN retransmits */
#define TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */
#define TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */
#define TCP_WINDOW_CLAMP 10 /* Bound advertised window */
#define TCP_INFO 11 /* Information about this connection. */
#define TCP_QUICKACK 12 /* Block/reenable quick ACKs. */
#define TCP_CONGESTION 13 /* Congestion control algorithm. */
#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */
#define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */
#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/
#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */
#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */
#define TCP_REPAIR 19 /* TCP sock is under repair right now */
#define TCP_REPAIR_QUEUE 20 /* Set TCP queue to repair */
#define TCP_QUEUE_SEQ 21 /* Set sequence number of repaired queue. */
#define TCP_REPAIR_OPTIONS 22 /* Repair TCP connection options */
#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
#define TCP_TIMESTAMP 24 /* TCP time stamp */
#define TCP_NOTSENT_LOWAT 25 /* Limit number of unsent bytes in
write queue. */
#define TCP_CC_INFO 26 /* Get Congestion Control
(optional) info. */
#define TCP_SAVE_SYN 27 /* Record SYN headers for new
connections. */
#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for
connection. */
#define TCP_REPAIR_WINDOW 29 /* Get/set window parameters. */
#define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect. */
#define TCP_ULP 31 /* Attach a ULP to a TCP connection. */
#define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions. */
#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie). */
#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie. */
#define TCP_ZEROCOPY_RECEIVE 35
#define TCP_INQ 36 /* Notify bytes available to read
as a cmsg on read. */
/* TCP_CM_INQ is an alias for TCP_INQ (same option number). */
#define TCP_CM_INQ TCP_INQ
#define TCP_TX_DELAY 37 /* Delay outgoing packets by XX usec. */
/*
* NOTE(review): the three values below are presumably the arguments passed
* with the TCP_REPAIR option — confirm against the system's tcp.h.
* TCP_REPAIR_OFF_NO_WP expands to an unparenthesized -1, so take care when
* using it inside a larger expression.
*/
#define TCP_REPAIR_ON 1
#define TCP_REPAIR_OFF 0
#define TCP_REPAIR_OFF_NO_WP -1
/*
* poll() event flag bits. Every value is a distinct single bit, so flags can
* be OR-ed together in an events mask and tested individually with `&`.
*/
#define POLLIN 0x0001 /* There is data to read */
#define POLLPRI 0x0002 /* There is urgent data to read */
#define POLLOUT 0x0004 /* Writing now will not block */
#define POLLERR 0x0008 /* Error condition */
#define POLLHUP 0x0010 /* Hung up */
#define POLLNVAL 0x0020 /* Invalid request: fd not open */
/*
 * Completion status reported for a SCSI task.
 *
 * The first group fits in a single byte. The remaining values are larger
 * than 0xff; presumably they are library-level conditions rather than
 * on-the-wire status bytes — confirm against the libiscsi headers.
 */
enum scsi_status {
	/* Values that fit in one byte. */
	SCSI_STATUS_GOOD                 = 0,
	SCSI_STATUS_CHECK_CONDITION      = 2,
	SCSI_STATUS_CONDITION_MET        = 4,
	SCSI_STATUS_BUSY                 = 8,
	SCSI_STATUS_RESERVATION_CONFLICT = 0x18,
	SCSI_STATUS_TASK_SET_FULL        = 0x28,
	SCSI_STATUS_ACA_ACTIVE           = 0x30,
	SCSI_STATUS_TASK_ABORTED         = 0x40,

	/* Values wider than one byte. */
	SCSI_STATUS_REDIRECT             = 0x101,
	SCSI_STATUS_CANCELLED            = 0x0f000000,
	SCSI_STATUS_ERROR                = 0x0f000001,
	SCSI_STATUS_TIMEOUT              = 0x0f000002,
};
/* Selects which transport implementation an iSCSI context uses. */
enum iscsi_transport_type {
	TCP_TRANSPORT  = 0,
	ISER_TRANSPORT = 1,
};
/*
 * iSCSI PDU opcodes, in ascending numeric order.
 *
 * All real opcodes fit in six bits (<= 0x3f). ISCSI_PDU_NO_PDU (0xff) lies
 * outside that range, so it can never collide with a real opcode.
 */
enum iscsi_opcode {
	/* 0x00 - 0x06: request-side PDUs. */
	ISCSI_PDU_NOP_OUT                       = 0x00,
	ISCSI_PDU_SCSI_REQUEST                  = 0x01,
	ISCSI_PDU_SCSI_TASK_MANAGEMENT_REQUEST  = 0x02,
	ISCSI_PDU_LOGIN_REQUEST                 = 0x03,
	ISCSI_PDU_TEXT_REQUEST                  = 0x04,
	ISCSI_PDU_DATA_OUT                      = 0x05,
	ISCSI_PDU_LOGOUT_REQUEST                = 0x06,

	/* 0x20 - 0x3f: response-side PDUs. */
	ISCSI_PDU_NOP_IN                        = 0x20,
	ISCSI_PDU_SCSI_RESPONSE                 = 0x21,
	ISCSI_PDU_SCSI_TASK_MANAGEMENT_RESPONSE = 0x22,
	ISCSI_PDU_LOGIN_RESPONSE                = 0x23,
	ISCSI_PDU_TEXT_RESPONSE                 = 0x24,
	ISCSI_PDU_DATA_IN                       = 0x25,
	ISCSI_PDU_LOGOUT_RESPONSE               = 0x26,
	ISCSI_PDU_R2T                           = 0x31,
	ISCSI_PDU_ASYNC_MSG                     = 0x32,
	ISCSI_PDU_REJECT                        = 0x3f,

	/* Out-of-range marker value. */
	ISCSI_PDU_NO_PDU                        = 0xff,
};
/*
 * SCSI command operation codes, listed in ascending numeric order.
 * Every value fits in a single byte.
 */
enum scsi_opcode {
	SCSI_OPCODE_TESTUNITREADY          = 0x00,
	SCSI_OPCODE_READ6                  = 0x08,
	SCSI_OPCODE_INQUIRY                = 0x12,
	SCSI_OPCODE_MODESELECT6            = 0x15,
	SCSI_OPCODE_RESERVE6               = 0x16,
	SCSI_OPCODE_RELEASE6               = 0x17,
	SCSI_OPCODE_MODESENSE6             = 0x1a,
	SCSI_OPCODE_STARTSTOPUNIT          = 0x1b,
	SCSI_OPCODE_PREVENTALLOW           = 0x1e,
	SCSI_OPCODE_READCAPACITY10         = 0x25,
	SCSI_OPCODE_READ10                 = 0x28,
	SCSI_OPCODE_WRITE10                = 0x2a,
	SCSI_OPCODE_WRITE_VERIFY10         = 0x2e,
	SCSI_OPCODE_VERIFY10               = 0x2f,
	SCSI_OPCODE_PREFETCH10             = 0x34,
	SCSI_OPCODE_SYNCHRONIZECACHE10     = 0x35,
	SCSI_OPCODE_READ_DEFECT_DATA10     = 0x37,
	SCSI_OPCODE_WRITE_SAME10           = 0x41,
	SCSI_OPCODE_UNMAP                  = 0x42,
	SCSI_OPCODE_READTOC                = 0x43,
	SCSI_OPCODE_SANITIZE               = 0x48,
	SCSI_OPCODE_MODESELECT10           = 0x55,
	SCSI_OPCODE_MODESENSE10            = 0x5a,
	SCSI_OPCODE_PERSISTENT_RESERVE_IN  = 0x5e,
	SCSI_OPCODE_PERSISTENT_RESERVE_OUT = 0x5f,
	SCSI_OPCODE_EXTENDED_COPY          = 0x83,
	SCSI_OPCODE_RECEIVE_COPY_RESULTS   = 0x84,
	SCSI_OPCODE_READ16                 = 0x88,
	SCSI_OPCODE_COMPARE_AND_WRITE      = 0x89,
	SCSI_OPCODE_WRITE16                = 0x8a,
	SCSI_OPCODE_ORWRITE                = 0x8b,
	SCSI_OPCODE_WRITE_VERIFY16         = 0x8e,
	SCSI_OPCODE_VERIFY16               = 0x8f,
	SCSI_OPCODE_PREFETCH16             = 0x90,
	SCSI_OPCODE_SYNCHRONIZECACHE16     = 0x91,
	SCSI_OPCODE_WRITE_SAME16           = 0x93,
	SCSI_OPCODE_WRITE_ATOMIC16         = 0x9c,
	SCSI_OPCODE_SERVICE_ACTION_IN      = 0x9e,
	SCSI_OPCODE_REPORTLUNS             = 0xa0,
	SCSI_OPCODE_MAINTENANCE_IN         = 0xa3,
	SCSI_OPCODE_READ12                 = 0xa8,
	SCSI_OPCODE_WRITE12                = 0xaa,
	SCSI_OPCODE_WRITE_VERIFY12         = 0xae,
	SCSI_OPCODE_VERIFY12               = 0xaf,
	SCSI_OPCODE_READ_DEFECT_DATA12     = 0xb7,
};
SPDK和libiscsi源码
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。