/* Name of the bdev to use; overridden by the -b command-line option. */
static char *g_bdev_name = "Malloc0";

/*
 * Usage function for printing parameters that are specific to this application.
 */
static void
hello_bdev_usage(void)
{
	printf(" -b <bdev> name of the bdev to use\n");
}

/*
 * This function is called to parse the parameters that are specific to this application.
 *
 * \param ch  Option character as returned by getopt (only 'b' is recognized here).
 * \param arg Option argument string; stored by reference, not copied, so the
 *            caller-owned buffer must outlive its use.
 * \return 0 on success, -EINVAL for an unrecognized option.
 */
static int
hello_bdev_parse_arg(int ch, char *arg)
{
	switch (ch) {
	case 'b':
		g_bdev_name = arg;
		break;
	default:
		return -EINVAL;
	}
	return 0;
}

/*
 * Call-site excerpt (from main): register the option parser/usage callbacks and
 * pick up the parsed bdev name afterwards:
 *
 *   spdk_app_parse_args(argc, argv, &opts, "b:", NULL, hello_bdev_parse_arg,
 *                       hello_bdev_usage)
 *   hello_context.bdev_name = g_bdev_name;
 */
rc = app_json_config_read(json_config_file, ctx); if (rc) { goto fail; }
/* Capture subsystems array */ rc = spdk_json_find_array(ctx->values, "subsystems", NULL, &ctx->subsystems); switch (rc) { case 0: /* Get first subsystem */ ctx->subsystems_it = spdk_json_array_first(ctx->subsystems); if (ctx->subsystems_it == NULL) { SPDK_NOTICELOG("'subsystems' configuration is empty\n"); } break; case -EPROTOTYPE: SPDK_ERRLOG("Invalid JSON configuration: not enclosed in {}.\n"); goto fail; case -ENOENT: SPDK_WARNLOG("No 'subsystems' key JSON configuration file.\n"); break; case -EDOM: SPDK_ERRLOG("Invalid JSON configuration: 'subsystems' should be an array.\n"); goto fail; default: SPDK_ERRLOG("Failed to parse JSON configuration.\n"); goto fail; }
/* If rpc_addr is not an Unix socket use default address as prefix. */ if (rpc_addr == NULL || rpc_addr[0] != '/') { rpc_addr = SPDK_DEFAULT_RPC_ADDR; }
/* FIXME: rpc client should use socketpair() instead of this temporary socket nonsense */ rc = snprintf(ctx->rpc_socket_path_temp, sizeof(ctx->rpc_socket_path_temp), "%s.%d_config", rpc_addr, getpid()); if (rc >= (int)sizeof(ctx->rpc_socket_path_temp)) { SPDK_ERRLOG("Socket name create failed\n"); goto fail; }
rc = spdk_rpc_initialize(ctx->rpc_socket_path_temp); if (rc) { goto fail; }
ctx->client_conn = spdk_jsonrpc_client_connect(ctx->rpc_socket_path_temp, AF_UNIX); if (ctx->client_conn == NULL) { SPDK_ERRLOG("Failed to connect to '%s'\n", ctx->rpc_socket_path_temp); goto fail; }
(gdb) bt #0 rpc_bdev_malloc_create (request=0xcac1f474c379e400, params=0x555555cc9570) at bdev_malloc_rpc.c:49 #10x00005555556b0a53 in jsonrpc_handler(request=0x555555cc04e0, method=0x555555c648e0, params=0x555555c64900) at rpc.c:124 #2 0x00005555556b2c5e in jsonrpc_server_handle_request(request=0x555555cc04e0, method=0x555555c648e0, params=0x555555c64900) at jsonrpc_server_tcp.c:222 #3 0x00005555556b1665 in parse_single_request(request=0x555555cc04e0, values=0x555555c64880) at jsonrpc_server.c:75 #4 0x00005555556b1c68 in jsonrpc_parse_request(conn=0x7ffff5f7e040, json=0x7ffff5f7e058, size=172) at jsonrpc_server.c:205 #5 0x00005555556b2eaa in jsonrpc_server_conn_recv(conn=0x7ffff5f7e040) at jsonrpc_server_tcp.c:284 #6 0x00005555556b3297 in spdk_jsonrpc_server_poll(server=0x7ffff5f7e010) at jsonrpc_server_tcp.c:402 #7 0x00005555556b0d59 in spdk_rpc_accept() at rpc.c:213 #8 0x00005555556a13c4 in rpc_subsystem_poll(arg=0x0) at rpc.c:21 #9 0x00005555556a82fd in thread_execute_timed_poller(thread=0x555555c9ec00, poller=0x555555cbf2c0, now=41542509569737) at thread.c:970 #10 0x00005555556a8613 in thread_poll(thread=0x555555c9ec00, max_msgs=0, now=41542509569737) at thread.c:1060 #11 0x00005555556a8837 in spdk_thread_poll(thread=0x555555c9ec00, max_msgs=0, now=41542509569737) at thread.c:1119 #12 0x000055555566d309 in _reactor_run(reactor=0x555555c7b780) at reactor.c:914 #13 0x000055555566d3fb in reactor_run(arg=0x555555c7b780) at reactor.c:952 #14 0x000055555566d887 in spdk_reactors_start() at reactor.c:1068 #15 0x0000555555669c5d in spdk_app_start(opts_user=0x7fffffffdea0, start_fn=0x55555556e1fb <hello_start>, arg1=0x7fffffffde40) at app.c:779 #16 0x000055555556e5d9 in main(argc=5, argv=0x7fffffffe078) at hello_bdev.c:306
// 函数调用栈 (gdb) bt #0 _sw_accel_copy_iovs (dst_iovs=0x555555cca0b8, dst_iovcnt=1, src_iovs=0x555555cca0a8, src_iovcnt=1) at accel_sw.c:115 #10x0000555555696577 in sw_accel_submit_tasks(ch=0x555555dadfd0, accel_task=0x555555cc9fb0) at accel_sw.c:455 #2 0x000055555568e5a2 in accel_submit_task(accel_ch=0x555555e51190, task=0x555555cc9fb0) at accel.c:305 #3 0x000055555568e723 in spdk_accel_submit_copy(ch=0x555555e51130, dst=0x200016600000, src=0x2000162efd00, nbytes=512, flags=0, cb_fn=0x55555556e83f <malloc_done>, cb_arg=0x200010aa2ae0) at accel.c:340 #4 0x000055555556eec4 in bdev_malloc_writev(mdisk=0x555555cc95c0, ch=0x555555e51130, task=0x200010aa2ae0, iov=0x200010aa2710, iovcnt=1, len=512, offset=0, md_buf=0x0, md_len=0, md_offset=0) at bdev_malloc.c:277 #5 0x000055555556f43b in _bdev_malloc_submit_request(mch=0x555555e50e60, bdev_io=0x200010aa2700) at bdev_malloc.c:382 #6 0x000055555556f69c in bdev_malloc_submit_request(ch=0x555555e50e00, bdev_io=0x200010aa2700) at bdev_malloc.c:457 #7 0x0000555555674c66 in bdev_submit_request(bdev=0x555555cc95c0, ioch=0x555555e50e00, bdev_io=0x200010aa2700) at bdev.c:1297 #8 0x000055555567784d in bdev_io_do_submit(bdev_ch=0x555555e50d50, bdev_io=0x200010aa2700) at bdev.c:2477 #9 0x000055555567947a in _bdev_io_submit(ctx=0x200010aa2700) at bdev.c:3173 #10 0x0000555555679a48 in bdev_io_submit(bdev_io=0x200010aa2700) at bdev.c:3293 #11 0x000055555567e0f7 in bdev_write_blocks_with_md(desc=0x555555e50b60, ch=0x555555e50cf0, buf=0x2000162efd00, md_buf=0x0, offset_blocks=0, num_blocks=1, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5195 #12 0x000055555567e1df in spdk_bdev_write_blocks(desc=0x555555e50b60, ch=0x555555e50cf0, buf=0x2000162efd00, offset_blocks=0, num_blocks=1, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5219 #13 0x000055555567e188 in spdk_bdev_write(desc=0x555555e50b60, ch=0x555555e50cf0, buf=0x2000162efd00, offset=0, nbytes=512, cb=0x55555556dd5e <write_complete>, 
cb_arg=0x7fffffffde40) at bdev.c:5211 #14 0x000055555556decc in hello_write(arg=0x7fffffffde40) at hello_bdev.c:139 #15 0x000055555556e4d3 in hello_start(arg1=0x7fffffffde40) at hello_bdev.c:276 #16 0x00005555556683f7 in app_start_application() at app.c:264 #17 0x0000555555668478 in app_start_rpc(rc=0, arg1=0x0) at app.c:285 #18 0x000055555569f259 in app_json_config_load_done(ctx=0x555555c9f000, rc=0) at json_config.c:111 #19 0x000055555569ffa6 in app_json_config_load_subsystem(_ctx=0x555555c9f000) at json_config.c:473 #20 0x00005555556a7bd0 in msg_queue_run_batch(thread=0x555555c9ec00, max_msgs=8) at thread.c:804 #21 0x00005555556a8528 in thread_poll(thread=0x555555c9ec00, max_msgs=0, now=121496004745246) at thread.c:1026 #22 0x00005555556a8837 in spdk_thread_poll(thread=0x555555c9ec00, max_msgs=0, now=121496004745246) at thread.c:1119 #23 0x000055555566d309 in _reactor_run(reactor=0x555555c7b780) at reactor.c:914 #24 0x000055555566d3fb in reactor_run(arg=0x555555c7b780) at reactor.c:952 #25 0x000055555566d887 in spdk_reactors_start() at reactor.c:1068 #26 0x0000555555669c5d in spdk_app_start(opts_user=0x7fffffffdea0, start_fn=0x55555556e1fb <hello_start>, arg1=0x7fffffffde40) at app.c:779 #27 0x000055555556e5d9 in main(argc=5, argv=0x7fffffffe078) at hello_bdev.c:306
static inline void bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) { /* After a request is submitted to a bdev module, the ownership of an accel sequence * associated with that bdev_io is transferred to the bdev module. So, clear the internal * sequence pointer to make sure we won't touch it anymore. */ if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); bdev_io->internal.accel_sequence = NULL; }
/**
 * Function table for a block device backend.
 *
 * The backend block device function table provides a set of APIs to allow
 * communication with a backend. The main commands are read/write API
 * calls for I/O via submit_request.
 */
struct spdk_bdev_fn_table {
	/** Destroy the backend block device object. */
	int (*destruct)(void *ctx);

	/** Process the IO. */
	void (*submit_request)(struct spdk_io_channel *ch, struct spdk_bdev_io *);

	/** Check if the block device supports a specific I/O type. */
	bool (*io_type_supported)(void *ctx, enum spdk_bdev_io_type);

	/** Get an I/O channel for the specific bdev for the calling thread. */
	struct spdk_io_channel *(*get_io_channel)(void *ctx);

	/**
	 * Output driver-specific information to a JSON stream. Optional - may be NULL.
	 *
	 * The JSON write context will be initialized with an open object, so the bdev
	 * driver should write a name (based on the driver name) followed by a JSON value
	 * (most likely another nested object).
	 */
	int (*dump_info_json)(void *ctx, struct spdk_json_write_ctx *w);

	/**
	 * Output bdev-specific RPC configuration to a JSON stream. Optional - may be NULL.
	 *
	 * This function should only be implemented for bdevs which can be configured
	 * independently of other bdevs. For example, RPCs to create a bdev for an NVMe
	 * namespace may not be generated by this function, since enumerating an NVMe
	 * namespace requires attaching to an NVMe controller, and that controller may
	 * contain multiple namespaces. The spdk_bdev_module's config_json function should
	 * be used instead for these cases.
	 *
	 * The JSON write context will be initialized with an open object, so the bdev
	 * driver should write all data necessary to recreate this bdev by invoking
	 * constructor method. No other data should be written.
	 */
	void (*write_config_json)(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w);

	/** Get spin-time per I/O channel in microseconds.
	 *  Optional - may be NULL.
	 */
	uint64_t (*get_spin_time)(struct spdk_io_channel *ch);

	/** Get bdev module context. */
	void *(*get_module_ctx)(void *ctx);

	/** Get memory domains used by bdev. Optional - may be NULL.
	 * Vbdev module implementation should call \ref spdk_bdev_get_memory_domains for underlying bdev.
	 * Vbdev module must inspect types of memory domains returned by base bdev and report only those
	 * memory domains that it can work with.
	 */
	int (*get_memory_domains)(void *ctx, struct spdk_memory_domain **domains, int array_size);

	/**
	 * Reset I/O statistics specific for this bdev context.
	 */
	void (*reset_device_stat)(void *ctx);

	/**
	 * Dump I/O statistics specific for this bdev context.
	 */
	void (*dump_device_stat_json)(void *ctx, struct spdk_json_write_ctx *w);

	/** Check if bdev can handle spdk_accel_sequence to handle I/O of specific type. */
	bool (*accel_sequence_supported)(void *ctx, enum spdk_bdev_io_type type);
};
int create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts) /* * Allocate the large backend memory buffer from pinned memory. * * TODO: need to pass a hint so we know which socket to allocate * from on multi-socket systems. */ mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); mdisk->disk.max_copy = 0; mdisk->disk.ctxt = mdisk; mdisk->disk.fn_table = &malloc_fn_table; mdisk->disk.module = &malloc_if;
spdk/doc/bdev.md There are two ways to create a block device based on an NVMe device in SPDK. The first way is to connect a local PCIe drive, and the second one is to connect an NVMe-oF device. In both cases the user should use the bdev_nvme_attach_controller RPC command to achieve that.
Example commands rpc.py bdev_nvme_attach_controller -b NVMe1 -t PCIe -a 0000:01:00.0 This command will create NVMe bdev of physical device in the system. rpc.py bdev_nvme_attach_controller -b Nvme0 -t RDMA -a 192.168.100.1 -f IPv4 -s 4420 -n nqn.2016-06.io.spdk:cnode1 This command will create NVMe bdev of NVMe-oF resource.
To remove an NVMe controller use the bdev_nvme_detach_controller command. rpc.py bdev_nvme_detach_controller Nvme0 This command will remove NVMe bdev named Nvme0.
Thread 1"reactor_0" hit Breakpoint 1, bdev_submit_request (bdev=0x555555cd8350, ioch=0x555555cca200, bdev_io=0x200010aa2700) at bdev.c:1291 1291if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || (gdb) bt #0 bdev_submit_request (bdev=0x555555cd8350, ioch=0x555555cca200, bdev_io=0x200010aa2700) at bdev.c:1291 #10x000055555567784d in bdev_io_do_submit (bdev_ch=0x555555cca150, bdev_io=0x200010aa2700) at bdev.c:2477 #20x000055555567947a in _bdev_io_submit (ctx=0x200010aa2700) at bdev.c:3173 #30x0000555555679a48 in bdev_io_submit (bdev_io=0x200010aa2700) at bdev.c:3293 #40x000055555567e0f7 in bdev_write_blocks_with_md (desc=0x555555e600d0, ch=0x555555cca0f0, buf=0x200003aeb340, md_buf=0x0, offset_blocks=0, num_blocks=1, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5195 #50x000055555567e1df in spdk_bdev_write_blocks (desc=0x555555e600d0, ch=0x555555cca0f0, buf=0x200003aeb340, offset_blocks=0, num_blocks=1, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5219 #60x000055555567e188 in spdk_bdev_write (desc=0x555555e600d0, ch=0x555555cca0f0, buf=0x200003aeb340, offset=0, nbytes=512, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5211 #70x000055555556decc in hello_write (arg=0x7fffffffde40) at hello_bdev.c:139 #80x000055555556e4d3 in hello_start (arg1=0x7fffffffde40) at hello_bdev.c:276 #90x00005555556683f7 in app_start_application () at app.c:264 #100x0000555555668478 in app_start_rpc (rc=0, arg1=0x0) at app.c:285 #110x000055555569f259 in app_json_config_load_done (ctx=0x555555c9f000, rc=0) at json_config.c:111 #120x000055555569ffa6 in app_json_config_load_subsystem (_ctx=0x555555c9f000) at json_config.c:473 #130x00005555556a7bd0 in msg_queue_run_batch (thread=0x555555c9ec00, max_msgs=8) at thread.c:804 #140x00005555556a8528 in thread_poll (thread=0x555555c9ec00, max_msgs=0, now=9579853985352) at thread.c:1026 #150x00005555556a8837 in spdk_thread_poll (thread=0x555555c9ec00, max_msgs=0, now=9579853985352) at 
thread.c:1119 #160x000055555566d309 in _reactor_run (reactor=0x555555c7b780) at reactor.c:914 #170x000055555566d3fb in reactor_run (arg=0x555555c7b780) at reactor.c:952 #180x000055555566d887 in spdk_reactors_start () at reactor.c:1068 #190x0000555555669c5d in spdk_app_start (opts_user=0x7fffffffdea0, start_fn=0x55555556e1fb <hello_start>, arg1=0x7fffffffde40) at app.c:779 #200x000055555556e5d9 in main (argc=5, argv=0x7fffffffe078) at hello_bdev.c:306 (gdb) n 1292 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { (gdb) n 1297 bdev->fn_table->submit_request(ioch, bdev_io); (gdb) p bdev->fn_table $1 = (conststruct spdk_bdev_fn_table *) 0x555555a10c80 <nvmelib_fn_table>
Thread 1"reactor_0" hit Breakpoint 2, nvme_disk_create (disk=0x770000007c, base_name=0x0, ctrlr=0x0, ns=0x1, prchk_flags=57, ctx=0x9) at bdev_nvme.c:3274 3274 { (gdb) bt #0 nvme_disk_create (disk=0x770000007c, base_name=0x0, ctrlr=0x0, ns=0x1, prchk_flags=57, ctx=0x9) at bdev_nvme.c:3274 #10x000055555557ab49 in nvme_bdev_create(nvme_ctrlr=0x555555cd7b10, nvme_ns=0x555555cd82d0) at bdev_nvme.c:3429 #2 0x000055555557b883 in nvme_ctrlr_populate_namespace(nvme_ctrlr=0x555555cd7b10, nvme_ns=0x555555cd82d0) at bdev_nvme.c:3752 #3 0x000055555557bdac in nvme_ctrlr_populate_namespaces(nvme_ctrlr=0x555555cd7b10, ctx=0x555555cc9f50) at bdev_nvme.c:3911 #4 0x000055555557cdcf in nvme_ctrlr_create_done(nvme_ctrlr=0x555555cd7b10, ctx=0x555555cc9f50) at bdev_nvme.c:4387 #5 0x000055555557d7cd in nvme_ctrlr_create(ctrlr=0x2000162ec0c0, name=0x555555cc0650"Nvme0", trid=0x555555cc9f78, ctx=0x555555cc9f50) at bdev_nvme.c:4628 #6 0x000055555557e779 in connect_attach_cb(cb_ctx=0x555555cca1c0, trid=0x2000162ec0e8, ctrlr=0x2000162ec0c0, opts=0x2000162ed6c8) at bdev_nvme.c:5054 #7 0x00005555555f5271 in nvme_ctrlr_poll_internal(ctrlr=0x2000162ec0c0, probe_ctx=0x555555cca520) at nvme.c:737 #8 0x00005555555f743a in spdk_nvme_probe_poll_async(probe_ctx=0x555555cca520) at nvme.c:1510 #9 0x000055555557e856 in bdev_nvme_async_poll(arg=0x555555cc9f50) at bdev_nvme.c:5089 #10 0x00005555556a82fd in thread_execute_timed_poller(thread=0x555555c9ec00, poller=0x555555cd7940, now=13419228290350) at thread.c:970 #11 0x00005555556a8613 in thread_poll(thread=0x555555c9ec00, max_msgs=0, now=13419228290350) at thread.c:1060 #12 0x00005555556a8837 in spdk_thread_poll(thread=0x555555c9ec00, max_msgs=0, now=13419228290350) at thread.c:1119 #13 0x000055555566d309 in _reactor_run(reactor=0x555555c7b780) at reactor.c:914 #14 0x000055555566d3fb in reactor_run(arg=0x555555c7b780) at reactor.c:952 --Type <RET> for more, q to quit, c to continue without paging-- #15 0x000055555566d887 in spdk_reactors_start() at 
reactor.c:1068 #16 0x0000555555669c5d in spdk_app_start(opts_user=0x7fffffffdea0, start_fn=0x55555556e1fb <hello_start>, arg1=0x7fffffffde40) at app.c:779 #17 0x000055555556e5d9 in main(argc=5, argv=0x7fffffffe078) at hello_bdev.c:306
// -b Nvme0n1的原因 (gdb) p disk->name $2 = 0x555555c7a0f0"Nvme0n1"
// env.h /** * Enumerate all PCI devices supported by the provided driver and try to * attach those that weren't attached yet. The provided callback will be * called for each such device and its return code will decide whether that * device is attached or not. Attached devices have to be manually detached * with spdk_pci_device_detach() to be attach-able again. * * During enumeration all registered pci devices with exposed access to * userspace are getting probed internally unless not explicitly specified * on denylist. Because of that it becomes not possible to either use such * devices with another application or unbind the driver (e.g. vfio). * * 2s asynchronous delay is introduced to avoid race conditions between * user space software initialization and in-kernel device handling for * newly inserted devices. Subsequent enumerate call after the delay * shall allow for a successful device attachment. * * \param driver Driver for a specific device type. * \param enum_cb Callback to be called for each non-attached PCI device. * \param enum_ctx Additional context passed to the callback function. * * \return -1 if an internal error occurred or the provided callback returned -1, * 0 otherwise */ int spdk_pci_enumerate(struct spdk_pci_driver *driver, spdk_pci_enum_cb enum_cb, void *enum_ctx);
/** * Allocate dma/sharable memory based on a given dma_flg. It is a memory buffer * with the given size, alignment and socket id. * * \param size Size in bytes. * \param align If non-zero, the allocated buffer is aligned to a multiple of * align. In this case, it must be a power of two. The returned buffer is always * aligned to at least cache line size. * \param phys_addr **Deprecated**. Please use spdk_vtophys() for retrieving physical * addresses. A pointer to the variable to hold the physical address of * the allocated buffer is passed. If NULL, the physical address is not returned. * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY * for any socket. * \param flags Combination of SPDK_MALLOC flags (\ref SPDK_MALLOC_DMA, \ref SPDK_MALLOC_SHARE). * At least one flag must be specified. * * \return a pointer to the allocated memory buffer. */ void *spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags);
/** * Enumerate the bus indicated by the transport ID and attach the userspace NVMe * driver to each device found if desired. * * This function is not thread safe and should only be called from one thread at * a time while no other threads are actively using any NVMe devices. * * If called from a secondary process, only devices that have been attached to * the userspace driver in the primary process will be probed. * * If called more than once, only devices that are not already attached to the * SPDK NVMe driver will be reported. * * To stop using the controller and release its associated resources, * call spdk_nvme_detach() with the spdk_nvme_ctrlr instance from the attach_cb() * function. * * \param trid The transport ID indicating which bus to enumerate. If the trtype * is PCIe or trid is NULL, this will scan the local PCIe bus. If the trtype is * RDMA, the traddr and trsvcid must point at the location of an NVMe-oF discovery * service. * \param cb_ctx Opaque value which will be passed back in cb_ctx parameter of * the callbacks. * \param probe_cb will be called once per NVMe device found in the system. * \param attach_cb will be called for devices for which probe_cb returned true * once that NVMe controller has been attached to the userspace driver. * \param remove_cb will be called for devices that were attached in a previous * spdk_nvme_probe() call but are no longer attached to the system. Optional; * specify NULL if removal notices are not desired. * * \return 0 on success, -1 on failure. */ int spdk_nvme_probe(const struct spdk_nvme_transport_id *trid, void *cb_ctx, spdk_nvme_probe_cb probe_cb, spdk_nvme_attach_cb attach_cb, spdk_nvme_remove_cb remove_cb);
/** * Allocate an I/O queue pair (submission and completion queue). * * This function by default also performs any connection activities required for * a newly created qpair. To avoid that behavior, the user should set the create_only * flag in the opts structure to true. * * Each queue pair should only be used from a single thread at a time (mutual * exclusion must be enforced by the user). * * \param ctrlr NVMe controller for which to allocate the I/O queue pair. * \param opts I/O qpair creation options, or NULL to use the defaults as returned * by spdk_nvme_ctrlr_get_default_io_qpair_opts(). * \param opts_size Must be set to sizeof(struct spdk_nvme_io_qpair_opts), or 0 * if opts is NULL. * * \return a pointer to the allocated I/O queue pair. */ struct spdk_nvme_qpair *spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_io_qpair_opts *opts, size_t opts_size);
// 省略一些成员 (some members omitted) /** * NVMe I/O queue pair initialization options. * * These options may be passed to spdk_nvme_ctrlr_alloc_io_qpair() to configure queue pair * options at queue creation time. * * The user may retrieve the default I/O queue pair creation options for a controller using * spdk_nvme_ctrlr_get_default_io_qpair_opts(). */ struct spdk_nvme_io_qpair_opts { /** * Queue priority for weighted round robin arbitration. If a different arbitration * method is in use, pass 0. */ enum spdk_nvme_qprio qprio;
/** * The queue depth of this NVMe I/O queue. Overrides spdk_nvme_ctrlr_opts::io_queue_size. */ uint32_t io_queue_size;
/** * The number of requests to allocate for this NVMe I/O queue. * * Overrides spdk_nvme_ctrlr_opts::io_queue_requests. * * This should be at least as large as io_queue_size. * * A single I/O may allocate more than one request, since splitting may be * necessary to conform to the device's maximum transfer size, PRP list * compatibility requirements, or driver-assisted striping. */ uint32_t io_queue_requests;
/** * \brief Submits a read I/O to the specified NVMe namespace. * * The command is submitted to a qpair allocated by spdk_nvme_ctrlr_alloc_io_qpair(). * The user must ensure that only one thread submits I/O on a given qpair at any * given time. * * \param ns NVMe namespace to submit the read I/O. * \param qpair I/O queue pair to submit the request. * \param payload Virtual address pointer to the data payload. * \param lba Starting LBA to read the data. * \param lba_count Length (in sectors) for the read operation. * \param cb_fn Callback function to invoke when the I/O is completed. * \param cb_arg Argument to pass to the callback function. * \param io_flags Set flags, defined in nvme_spec.h, for this I/O. * * \return 0 if successfully submitted, negated errnos on the following error conditions: * -EINVAL: The request is malformed. * -ENOMEM: The request cannot be allocated. * -ENXIO: The qpair is failed at the transport level. */ int spdk_nvme_ns_cmd_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *payload, uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags);
/** * Process any outstanding completions for I/O submitted on a queue pair. * * This call is non-blocking, i.e. it only processes completions that are ready * at the time of this function call. It does not wait for outstanding commands * to finish. * * For each completed command, the request's callback function will be called if * specified as non-NULL when the request was submitted. * * The caller must ensure that each queue pair is only used from one thread at a * time. * * This function may be called at any point while the controller is attached to * the SPDK NVMe driver. * * \sa spdk_nvme_cmd_cb * * \param qpair Queue pair to check for completions. * \param max_completions Limit the number of completions to be processed in one * call, or 0 for unlimited. * * \return number of completions processed (may be 0) or negated on error. -ENXIO * in the special case that the qpair is failed at the transport layer. */ int32_t spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions);
/** * True if the request is in the queued_req list. */ uint8_t queued : 1; uint8_t reserved : 6;
/** * Number of children requests still outstanding for this * request which was split into multiple child requests. */ uint16_t num_children;
/** * Offset in bytes from the beginning of payload for this request. * This is used for I/O commands that are split into multiple requests. */ uint32_t payload_offset; uint32_t md_offset;
uint32_t payload_size;
/** * Timeout ticks for error injection requests, can be extended in future * to support per-request timeout feature. */ uint64_t timeout_tsc;
/** * Data payload for this request's command. */ struct nvme_payload payload;
/* * The value of spdk_get_ticks() when the request was submitted to the hardware. * Only set if ctrlr->timeout_enabled is true. */ uint64_t submit_tick;
/** * The active admin request can be moved to a per process pending * list based on the saved pid to tell which process it belongs * to. The cpl saves the original completion information which * is used in the completion callback. * NOTE: these below two fields are only used for admin request. */ pid_t pid; struct spdk_nvme_cpl cpl;
uint32_t md_size;
/** * The following members should not be reordered with members * above. These members are only needed when splitting * requests which is done rarely, and the driver is careful * to not touch the following fields until a split operation is * needed, to avoid touching an extra cacheline. */
/** * Points to the outstanding child requests for a parent request. * Only valid if a request was split into multiple children * requests, and is not initialized for non-split requests. */ TAILQ_HEAD(, nvme_request) children;
/** * Linked-list pointers for a child request in its parent's list. */ TAILQ_ENTRY(nvme_request) child_tailq;
/** * Points to a parent request if part of a split request, * NULL otherwise. */ struct nvme_request *parent;
/** * Completion status for a parent request. Initialized to all 0's * (SUCCESS) before child requests are submitted. If a child * request completes with error, the error status is copied here, * to ensure that the parent request is also completed with error * status once all child requests are completed. */ struct spdk_nvme_cpl parent_status;
/** * The user_cb_fn and user_cb_arg fields are used for holding the original * callback data when using nvme_allocate_request_user_copy. */ spdk_nvme_cmd_cb user_cb_fn; void *user_cb_arg; void *user_buffer; };
C
1 2 3 4 5 6 7 8 9 10 11
struct __attribute__((packed)) spdk_nvme_ctrlr_data { /* bytes 0-255: controller capabilities and features */
/** * Submit a write I/O to the specified NVMe namespace. * * The command is submitted to a qpair allocated by spdk_nvme_ctrlr_alloc_io_qpair(). * The user must ensure that only one thread submits I/O on a given qpair at any * given time. * * \param ns NVMe namespace to submit the write I/O. * \param qpair I/O queue pair to submit the request. * \param lba Starting LBA to write the data. * \param lba_count Length (in sectors) for the write operation. * \param cb_fn Callback function to invoke when the I/O is completed. * \param cb_arg Argument to pass to the callback function. * \param io_flags Set flags, defined in nvme_spec.h, for this I/O. * \param reset_sgl_fn Callback function to reset scattered payload. * \param next_sge_fn Callback function to iterate each scattered payload memory * segment. * * \return 0 if successfully submitted, negated errnos on the following error conditions: * -EINVAL: The request is malformed. * -ENOMEM: The request cannot be allocated. * -ENXIO: The qpair is failed at the transport level. */ int spdk_nvme_ns_cmd_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, spdk_nvme_req_reset_sgl_cb reset_sgl_fn, spdk_nvme_req_next_sge_cb next_sge_fn);
C
SPDK Blobstore: A Look Inside the NVM Optimized Allocator
/* * Setup a few specifics before we init, for most SPDK cmd line * apps, the config file will be passed in as an arg but to make * this example super simple we just hardcode it. We also need to * specify a name for the app. */ opts.name = "hello_blob"; opts.json_config_file = argv[1];
Message passing is efficient, but it results in asynchronous code. Unfortunately, asynchronous code is a challenge in C. It’s often implemented by passing function pointers that are called when an operation completes. This chops up the code so that it isn’t easy to follow, especially through logic branches. The best solution is to use a language with support for futures and promises, such as C++, Rust, Go, or almost any other higher level language. However, SPDK is a low level library and requires very wide compatibility and portability, so we’ve elected to stay with plain old C. We do have a few recommendations to share, though. For simple callback chains, it’s easiest if you write the functions from bottom to top. By that we mean if function foo performs some asynchronous operation and when that completes function bar is called, then function bar performs some operation that calls function baz on completion, a good way to write it is as such:
1 2 3 4 5 6 7 8 9 10 11
void baz(void *ctx) { ... }
void bar(void *ctx) { async_op(baz, ctx); }
void foo(void *ctx) { async_op(bar, ctx); }
C
Don’t split these functions up - keep them as a nice unit that can be read from bottom to top.
/** * Start the framework. * * Before calling this function, opts must be initialized by * spdk_app_opts_init(). Once started, the framework will call start_fn on * an spdk_thread running on the current system thread with the * argument provided. * * If opts->delay_subsystem_init is set * (e.g. through --wait-for-rpc flag in spdk_app_parse_args()) * this function will only start a limited RPC server accepting * only a few RPC commands - mostly related to pre-initialization. * With this option, the framework won't be started and start_fn * won't be called until the user sends an `rpc_framework_start_init` * RPC command, which marks the pre-initialization complete and * allows start_fn to be finally called. * * This call will block until spdk_app_stop() is called. If an error * condition occurs during the initialization code within spdk_app_start(), * this function will immediately return before invoking start_fn. * * \param opts_user Initialization options used for this application. It should not be * NULL. And the opts_size value inside the opts structure should not be zero. * \param start_fn Entry point that will execute on an internally created thread * once the framework has been started. * \param ctx Argument passed to function start_fn. * * \return 0 on success or non-zero on failure. */ int spdk_app_start(struct spdk_app_opts *opts_user, spdk_msg_fn start_fn, void *ctx);
/** * Perform final shutdown operations on an application using the event framework. */ void spdk_app_fini(void);
/** * Start shutting down the framework. * * Typically this function is not called directly, and the shutdown process is * started implicitly by a process signal. But in applications that are using * SPDK for a subset of its process threads, this function can be called in lieu * of a signal. */ void spdk_app_start_shutdown(void); /** * Stop the framework. * * This does not wait for all threads to exit. Instead, it kicks off the shutdown * process and returns. Once the shutdown process is complete, spdk_app_start() * will return. * * \param rc The rc value specified here will be returned to caller of spdk_app_start(). */ void spdk_app_stop(int rc);
// Create a blobstore block device from a bdev. rc = spdk_bdev_create_bs_dev_ext("Malloc0", base_bdev_event_cb, NULL, &bs_dev);
// Initialize a blobstore on the given device. spdk_bs_init(bs_dev, NULL, bs_init_complete, hello_context);
// Create a new blob with default option values on the given blobstore. The new blob id will be passed to the callback function. spdk_bs_create_blob(hello_context->bs, blob_create_complete, hello_context);
// Open a blob from the given blobstore. spdk_bs_open_blob(hello_context->bs, hello_context->blobid, open_complete, hello_context);
// Resize a blob to 'sz' clusters. These changes are not persisted to disk until spdk_blob_sync_md() is called. If called before the previous resize has finished, it will fail with errno -EBUSY spdk_blob_resize(hello_context->blob, free, resize_complete, hello_context);
// Sync a blob. Make a blob persistent. This applies to open, resize, set xattr, and remove xattr. These operations will not be persistent until the blob has been synced. spdk_blob_sync_md(hello_context->blob, sync_complete, hello_context);
// Allocate an I/O channel for the given blobstore. hello_context->channel = spdk_bs_alloc_io_channel(hello_context->bs);
// Write data to a blob. spdk_blob_io_write(hello_context->blob, hello_context->channel, hello_context->write_buff, 0, 1, write_complete, hello_context);
// Read data from a blob. spdk_blob_io_read(hello_context->blob, hello_context->channel, hello_context->read_buff, 0, 1, read_complete, hello_context);
// Close a blob. This will automatically sync. spdk_blob_close(hello_context->blob, delete_blob, hello_context);
// Delete an existing blob from the given blobstore. spdk_bs_delete_blob(hello_context->bs, hello_context->blobid, delete_complete, hello_context);
// Free the I/O channel. spdk_bs_free_io_channel(hello_context->channel);
// Unload the blobstore. It will flush all volatile data to disk. spdk_bs_unload(hello_context->bs, unload_complete, hello_context);
// Other APIs
// Get the io unit size in bytes. hello_context->io_unit_size = spdk_bs_get_io_unit_size(hello_context->bs);
// Get the number of free clusters. free = spdk_bs_free_cluster_count(hello_context->bs);
// Get the number of clusters allocated to the blob. total = spdk_blob_get_num_clusters(hello_context->blob);
// Size of used_md_pages if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { /* By default, allocate 1 page per cluster. * Technically, this over-allocates metadata * because more metadata will reduce the number * of usable clusters. This can be addressed with * more complex math in the future. */ // one metadata page per cluster bs->md_len = bs->total_clusters; } else { bs->md_len = opts.num_md_pages; } rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len);
/* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper * 32 bits are not currently used. Stick a 1 there just to catch bugs where the * code assumes blob id == page_idx. */ staticinline spdk_blob_id bs_page_to_blobid(uint64_t page_idx) { if (page_idx > UINT32_MAX) { return SPDK_BLOBID_INVALID; } return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx; }
/* Parse the pages */ rc = blob_parse(ctx->pages, ctx->num_pages, blob);
if (blob->extent_table_found == true) { /* If EXTENT_TABLE was found, that means support for it should be enabled. */ assert(blob->extent_rle_found == false); blob->use_extent_table = true; } else { /* If EXTENT_RLE or no extent_* descriptor was found disable support * for extent table. No extent_* descriptors means that blob has length of 0 * and no extent_rle descriptors were persisted for it. * EXTENT_TABLE if used, is always present in metadata regardless of length. */ blob->use_extent_table = false; }
/* Check the clear_method stored in metadata vs what may have been passed * via spdk_bs_open_blob_ext() and update accordingly. */ blob_update_clear_method(blob);
ctx->thread = spdk_get_thread(); ctx->blob = blob; ctx->cluster_num = cluster_num; ctx->cluster = cluster; ctx->extent_page = extent_page; ctx->page = page; ctx->cb_fn = cb_fn; ctx->cb_arg = cb_arg; // Send a message to the given thread. The message will be sent asynchronously - i.e. spdk_thread_send_msg will always return prior to fn being called. spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx); }
void spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) { /* Calculate how many pages the metadata consumes at the front * of the disk. */
/* The super block uses 1 page */ num_md_pages = 1;
/* The used_md_pages mask requires 1 bit per metadata page, rounded * up to the nearest page, plus a header. */ ctx->super->used_page_mask_start = num_md_pages; ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +spdk_divide_round_up(bs->md_len, 8),SPDK_BS_PAGE_SIZE); num_md_pages += ctx->super->used_page_mask_len;
/* The used_clusters mask requires 1 bit per cluster, rounded * up to the nearest page, plus a header. */ ctx->super->used_cluster_mask_start = num_md_pages; ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + spdk_divide_round_up(bs->total_clusters, 8), SPDK_BS_PAGE_SIZE); /* The blobstore might be extended, then the used_cluster bitmap will need more space. * Here we calculate the max clusters we can support according to the * num_md_pages (bs->md_len). */ max_used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + spdk_divide_round_up(bs->md_len, 8), SPDK_BS_PAGE_SIZE); max_used_cluster_mask_len = spdk_max(max_used_cluster_mask_len, ctx->super->used_cluster_mask_len); num_md_pages += max_used_cluster_mask_len;
/* The used_blobids mask requires 1 bit per metadata page, rounded * up to the nearest page, plus a header. */ ctx->super->used_blobid_mask_start = num_md_pages; ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + spdk_divide_round_up(bs->md_len, 8), SPDK_BS_PAGE_SIZE); num_md_pages += ctx->super->used_blobid_mask_len;
/* The metadata region size was chosen above */ ctx->super->md_start = bs->md_start = num_md_pages; ctx->super->md_len = bs->md_len; num_md_pages += bs->md_len;
num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster); /* Claim all of the clusters used by the metadata */ for (i = 0; i < num_md_clusters; i++) { spdk_bit_array_set(ctx->used_clusters, i); }
struct spdk_bs_super_block { uint8_t signature[8]; uint32_t version; uint32_t length; uint32_t clean; /* If there was a clean shutdown, this is 1. */ spdk_blob_id super_blob;
uint32_t cluster_size; /* In bytes */
uint32_t used_page_mask_start; /* Offset from beginning of disk, in pages */ uint32_t used_page_mask_len; /* Count, in pages */
uint32_t used_cluster_mask_start; /* Offset from beginning of disk, in pages */ uint32_t used_cluster_mask_len; /* Count, in pages */
uint32_t md_start; /* Offset from beginning of disk, in pages */ uint32_t md_len; /* Count, in pages */
struct spdk_bs_type bstype; /* blobstore type */
uint32_t used_blobid_mask_start; /* Offset from beginning of disk, in pages */ uint32_t used_blobid_mask_len; /* Count, in pages */
uint64_t size; /* size of blobstore in bytes */ uint32_t io_unit_size; /* Size of io unit in bytes */
Why the length must also include sizeof(struct spdk_bs_md_mask): /* The used_md_pages mask requires 1 bit per metadata page, rounded * up to the nearest page, plus a header. */ ctx->super->used_page_mask_start = num_md_pages; ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + spdk_divide_round_up(bs->md_len, 8), SPDK_BS_PAGE_SIZE); num_md_pages += ctx->super->used_page_mask_len;
/* A generic request set. Can be a sequence, batch or a user_op. */ struct spdk_bs_request_set { struct spdk_bs_cpl cpl;
int bserrno;
/* * The blobstore's channel, obtained by blobstore consumers via * spdk_bs_alloc_io_channel(). Used for IO to the blobstore. */ struct spdk_bs_channel *channel; /* * The channel used by the blobstore to perform IO on back_bs_dev. Unless the blob * is an esnap clone, back_channel == spdk_io_channel_get_ctx(set->channel). */ struct spdk_io_channel *back_channel;
// 使用gdb显示bs_init_complete调用栈 Thread 1"reactor_0" hit Breakpoint 4, bs_init_complete (cb_arg=0xf19affa87bdd2000, bs=0x7fffffffd5a0, bserrno=8192) at hello_blob.c:287 287staticvoidbs_init_complete(void *cb_arg, struct spdk_blob_store *bs, int bserrno) { (gdb) bt #0 bs_init_complete (cb_arg=0xf19affa87bdd2000, bs=0x7fffffffd5a0, bserrno=8192) at hello_blob.c:287 #10x00005555555d6bb0 in bs_call_cpl(cpl=0x7fffffffd5e0, bserrno=0) at request.c:26 #2 0x00005555555d6d0d in bs_request_set_complete(set=0x555555d48c20) at request.c:63 #3 0x00005555555d77ec in bs_sequence_finish(seq=0x555555d48c20, bserrno=0) at request.c:295 #4 0x00005555555cd558 in bs_init_persist_super_cpl(seq=0x555555d48c20, cb_arg=0x555555ecfa20, bserrno=0) at blobstore.c:5217 #5 0x00005555555d6d5e in bs_sequence_completion(channel=0x555555cd4480, cb_arg=0x555555d48c20, bserrno=0) at request.c:72 #6 0x00005555555b3b8e in bdev_blob_io_complete(bdev_io=0x200013aa2700, success=true, arg=0x555555d48c60) at blob_bdev.c:64 #7 0x000055555568e23c in _bdev_io_complete(ctx=0x200013aa2700) at bdev.c:6970 #8 0x000055555568e3dc in bdev_io_complete(ctx=0x200013aa2700) at bdev.c:7003 #9 0x000055555568e900 in spdk_bdev_io_complete(bdev_io=0x200013aa2700, status=SPDK_BDEV_IO_STATUS_SUCCESS) at bdev.c:7131 #10 0x0000555555570c6c in malloc_done(ref=0x200013aa2ae0, status=0) at bdev_malloc.c:130 #11 0x0000555555570f05 in malloc_sequence_done(ctx=0x200013aa2ae0, status=0) at bdev_malloc.c:225 #12 0x000055555569df8b in accel_sequence_complete(seq=0x555555e1dc40) at accel.c:1250 #13 0x000055555569f43c in accel_process_sequence(seq=0x555555e1dc40) at accel.c:1675 #14 0x000055555569f749 in accel_sequence_task_cb(cb_arg=0x555555e1dc40, status=0) at accel.c:1750 #15 0x000055555569b67d in spdk_accel_task_complete(accel_task=0x555555d5ddb0, status=0) at accel.c:292 #16 0x00005555556a4d66 in accel_comp_poll(arg=0x555555e75cc0) at accel_sw.c:525 #17 0x00005555556b6511 in thread_execute_poller(thread=0x555555ce17e0, 
poller=0x555555cd7b10) at thread.c:946 #18 0x00005555556b6a95 in thread_poll(thread=0x555555ce17e0, max_msgs=0, now=3835790851557406) at thread.c:1072 #19 0x00005555556b6d44 in spdk_thread_poll(thread=0x555555ce17e0, max_msgs=0, now=3835790851557406) at thread.c:1156 #20 0x0000555555679572 in _reactor_run(reactor=0x555555ce14c0) at reactor.c:914 #21 0x0000555555679664 in reactor_run(arg=0x555555ce14c0) at reactor.c:952 #22 0x0000555555679aeb in spdk_reactors_start() at reactor.c:1068 #23 0x0000555555675c11 in spdk_app_start(opts_user=0x7fffffffde70, start_fn=0x5555555705cc <hello_start>, arg1=0x555555cce4c0) at app.c:827 --Type <RET> for more, q to quit, c to continue without paging-- #24 0x00005555555707a7 in main(argc=2, argv=0x7fffffffe058) at hello_blob.c:390