SPDK Study Notes

Previous posts:
Setting up the SPDK environment

hello_bdev

Source path: examples/bdev/hello_world/hello_bdev.c
Executable path: build/examples/hello_bdev

Running hello_bdev directly at first fails because Malloc0 cannot be found:

./build/examples/hello_bdev
[2023-05-30 20:27:02.389489] Starting SPDK v23.05-pre git sha1 ee06693c3 / DPDK 22.11.1 initialization...
[2023-05-30 20:27:02.390910] [ DPDK EAL parameters: hello_bdev --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid11584 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-05-30 20:27:02.511380] app.c: 738:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-05-30 20:27:02.561201] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-05-30 20:27:02.600284] accel_sw.c: 601:sw_accel_module_init: *NOTICE*: Accel framework software module initialized.
[2023-05-30 20:27:02.621229] hello_bdev.c: 222:hello_start: *NOTICE*: Successfully started the application
[2023-05-30 20:27:02.621612] hello_bdev.c: 231:hello_start: *NOTICE*: Opening the bdev Malloc0
[2023-05-30 20:27:02.621691] bdev.c:7681:spdk_bdev_open_ext: *NOTICE*: Currently unable to find bdev with name: Malloc0
[2023-05-30 20:27:02.621761] hello_bdev.c: 235:hello_start: *ERROR*: Could not open bdev: Malloc0
[2023-05-30 20:27:02.621852] app.c: 844:spdk_app_stop: *WARNING*: spdk_app_stop'd on non-zero
[2023-05-30 20:27:02.691191] hello_bdev.c: 308:main: *ERROR*: ERROR starting application

I found the corresponding issue online: https://github.com/spdk/spdk/issues/1550

The correct way to run it is:

./build/examples/hello_bdev -c ./examples/bdev/hello_world/bdev.json -b Malloc0


[2023-05-30 20:25:59.131197] Starting SPDK v23.05-pre git sha1 ee06693c3 / DPDK 22.11.1 initialization...
[2023-05-30 20:25:59.132037] [ DPDK EAL parameters: hello_bdev --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid11462 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-05-30 20:25:59.252268] app.c: 738:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-05-30 20:25:59.303646] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-05-30 20:25:59.359161] accel_sw.c: 601:sw_accel_module_init: *NOTICE*: Accel framework software module initialized.
[2023-05-30 20:25:59.387635] hello_bdev.c: 222:hello_start: *NOTICE*: Successfully started the application
[2023-05-30 20:25:59.388053] hello_bdev.c: 231:hello_start: *NOTICE*: Opening the bdev Malloc0
[2023-05-30 20:25:59.388153] hello_bdev.c: 244:hello_start: *NOTICE*: Opening io channel
[2023-05-30 20:25:59.388529] hello_bdev.c: 138:hello_write: *NOTICE*: Writing to the bdev
[2023-05-30 20:25:59.388757] hello_bdev.c: 117:write_complete: *NOTICE*: bdev io write completed successfully
[2023-05-30 20:25:59.388931] hello_bdev.c: 84:hello_read: *NOTICE*: Reading io
[2023-05-30 20:25:59.389019] hello_bdev.c: 65:read_complete: *NOTICE*: Read string from bdev : Hello World!

[2023-05-30 20:25:59.389128] hello_bdev.c: 74:read_complete: *NOTICE*: Stopping app

Command-line parameters

The -b parameter

static char *g_bdev_name = "Malloc0";

/*
 * Usage function for printing parameters that are specific to this application
 */
static void
hello_bdev_usage(void)
{
    printf(" -b <bdev> name of the bdev to use\n");
}

/*
 * This function is called to parse the parameters that are specific to this application
 */
static int
hello_bdev_parse_arg(int ch, char *arg)
{
    switch (ch) {
    case 'b':
        g_bdev_name = arg;
        break;
    default:
        return -EINVAL;
    }
    return 0;
}

spdk_app_parse_args(argc, argv, &opts, "b:", NULL, hello_bdev_parse_arg, hello_bdev_usage)
hello_context.bdev_name = g_bdev_name;

As you can see, g_bdev_name already defaults to Malloc0, so passing -b Malloc0 is redundant here. (The optstring "b:" handed to spdk_app_parse_args is what registers -b as an option taking an argument; hello_bdev_parse_arg then stores the value in g_bdev_name.)

The -c parameter

static void
usage(void (*app_usage)(void))
{
    printf("%s [options]\n", g_executable_name);
    printf("options:\n");
    printf(" -c, --config <config>     JSON config file (default %s)\n",
           g_default_opts.json_config_file != NULL ? g_default_opts.json_config_file : "none");

-c is followed by the name of a JSON configuration file. The contents of bdev.json are:

{
  "subsystems": [
    {
      "subsystem": "bdev",
      "config": [
        {
          "method": "bdev_malloc_create",
          "params": {
            "name": "Malloc0",
            "num_blocks": 32768,
            "block_size": 512
          }
        }
      ]
    }
  ]
}
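
Incidentally, the same bdev can also be created at runtime through the RPC interface instead of a config file. Assuming rpc.py's usual bdev_malloc_create options (total size is given in MiB, and 32768 blocks × 512 B = 16 MiB), the equivalent call would be roughly:

scripts/rpc.py bdev_malloc_create -b Malloc0 16 512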

For a brief look at how the JSON is parsed: a global search for json_config_file leads to the spdk_subsystem_init_from_json_config function.

void
spdk_subsystem_init_from_json_config(const char *json_config_file, const char *rpc_addr,
                                     spdk_subsystem_init_fn cb_fn, void *cb_arg,
                                     bool stop_on_error)
{
    struct load_json_config_ctx *ctx = calloc(1, sizeof(*ctx));
    int rc;

    assert(cb_fn);
    if (!ctx) {
        cb_fn(-ENOMEM, cb_arg);
        return;
    }

    ctx->cb_fn = cb_fn;
    ctx->cb_arg = cb_arg;
    ctx->stop_on_error = stop_on_error;
    ctx->thread = spdk_get_thread();

    rc = app_json_config_read(json_config_file, ctx);
    if (rc) {
        goto fail;
    }

    /* Capture subsystems array */
    rc = spdk_json_find_array(ctx->values, "subsystems", NULL, &ctx->subsystems);
    switch (rc) {
    case 0:
        /* Get first subsystem */
        ctx->subsystems_it = spdk_json_array_first(ctx->subsystems);
        if (ctx->subsystems_it == NULL) {
            SPDK_NOTICELOG("'subsystems' configuration is empty\n");
        }
        break;
    case -EPROTOTYPE:
        SPDK_ERRLOG("Invalid JSON configuration: not enclosed in {}.\n");
        goto fail;
    case -ENOENT:
        SPDK_WARNLOG("No 'subsystems' key JSON configuration file.\n");
        break;
    case -EDOM:
        SPDK_ERRLOG("Invalid JSON configuration: 'subsystems' should be an array.\n");
        goto fail;
    default:
        SPDK_ERRLOG("Failed to parse JSON configuration.\n");
        goto fail;
    }

    /* If rpc_addr is not an Unix socket use default address as prefix. */
    if (rpc_addr == NULL || rpc_addr[0] != '/') {
        rpc_addr = SPDK_DEFAULT_RPC_ADDR;
    }

    /* FIXME: rpc client should use socketpair() instead of this temporary socket nonsense */
    rc = snprintf(ctx->rpc_socket_path_temp, sizeof(ctx->rpc_socket_path_temp), "%s.%d_config",
                  rpc_addr, getpid());
    if (rc >= (int)sizeof(ctx->rpc_socket_path_temp)) {
        SPDK_ERRLOG("Socket name create failed\n");
        goto fail;
    }

    rc = spdk_rpc_initialize(ctx->rpc_socket_path_temp);
    if (rc) {
        goto fail;
    }

    ctx->client_conn = spdk_jsonrpc_client_connect(ctx->rpc_socket_path_temp, AF_UNIX);
    if (ctx->client_conn == NULL) {
        SPDK_ERRLOG("Failed to connect to '%s'\n", ctx->rpc_socket_path_temp);
        goto fail;
    }

    rpc_client_set_timeout(ctx, RPC_CLIENT_CONNECT_TIMEOUT_US);
    ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_connect_poller, ctx, 100);
    return;

fail:
    app_json_config_load_done(ctx, -EINVAL);
}

A global search for bdev_malloc_create leads to the rpc_bdev_malloc_create function.

static void
rpc_bdev_malloc_create(struct spdk_jsonrpc_request *request,
                       const struct spdk_json_val *params)
{
    struct malloc_bdev_opts req = {NULL};
    struct spdk_json_write_ctx *w;
    struct spdk_bdev *bdev;
    int rc = 0;

    if (spdk_json_decode_object(params, rpc_construct_malloc_decoders,
                                SPDK_COUNTOF(rpc_construct_malloc_decoders),
                                &req)) {
        SPDK_DEBUGLOG(bdev_malloc, "spdk_json_decode_object failed\n");
        spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
                                         "spdk_json_decode_object failed");
        goto cleanup;
    }

    rc = create_malloc_disk(&bdev, &req);
    if (rc) {
        spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
        goto cleanup;
    }

    free_rpc_construct_malloc(&req);

    w = spdk_jsonrpc_begin_result(request);
    spdk_json_write_string(w, spdk_bdev_get_name(bdev));
    spdk_jsonrpc_end_result(request, w);
    return;

cleanup:
    free_rpc_construct_malloc(&req);
}
SPDK_RPC_REGISTER("bdev_malloc_create", rpc_bdev_malloc_create, SPDK_RPC_RUNTIME)

The backtrace when execution reaches this function:

(gdb) bt
#0 rpc_bdev_malloc_create (request=0xcac1f474c379e400, params=0x555555cc9570) at bdev_malloc_rpc.c:49
#1 0x00005555556b0a53 in jsonrpc_handler (request=0x555555cc04e0, method=0x555555c648e0, params=0x555555c64900) at rpc.c:124
#2 0x00005555556b2c5e in jsonrpc_server_handle_request (request=0x555555cc04e0, method=0x555555c648e0, params=0x555555c64900) at jsonrpc_server_tcp.c:222
#3 0x00005555556b1665 in parse_single_request (request=0x555555cc04e0, values=0x555555c64880) at jsonrpc_server.c:75
#4 0x00005555556b1c68 in jsonrpc_parse_request (conn=0x7ffff5f7e040, json=0x7ffff5f7e058, size=172) at jsonrpc_server.c:205
#5 0x00005555556b2eaa in jsonrpc_server_conn_recv (conn=0x7ffff5f7e040) at jsonrpc_server_tcp.c:284
#6 0x00005555556b3297 in spdk_jsonrpc_server_poll (server=0x7ffff5f7e010) at jsonrpc_server_tcp.c:402
#7 0x00005555556b0d59 in spdk_rpc_accept () at rpc.c:213
#8 0x00005555556a13c4 in rpc_subsystem_poll (arg=0x0) at rpc.c:21
#9 0x00005555556a82fd in thread_execute_timed_poller (thread=0x555555c9ec00, poller=0x555555cbf2c0, now=41542509569737) at thread.c:970
#10 0x00005555556a8613 in thread_poll (thread=0x555555c9ec00, max_msgs=0, now=41542509569737) at thread.c:1060
#11 0x00005555556a8837 in spdk_thread_poll (thread=0x555555c9ec00, max_msgs=0, now=41542509569737) at thread.c:1119
#12 0x000055555566d309 in _reactor_run (reactor=0x555555c7b780) at reactor.c:914
#13 0x000055555566d3fb in reactor_run (arg=0x555555c7b780) at reactor.c:952
#14 0x000055555566d887 in spdk_reactors_start () at reactor.c:1068
#15 0x0000555555669c5d in spdk_app_start (opts_user=0x7fffffffdea0, start_fn=0x55555556e1fb <hello_start>, arg1=0x7fffffffde40) at app.c:779
#16 0x000055555556e5d9 in main (argc=5, argv=0x7fffffffe078) at hello_bdev.c:306

(gdb) p req
$19 = {name = 0x555555cc9580 "Malloc0", uuid = {u = {raw = '\000' <repeats 15 times>}}, num_blocks = 32768, block_size = 512, physical_block_size = 0, optimal_io_boundary = 0, md_size = 0,
md_interleave = false, dif_type = SPDK_DIF_DISABLE, dif_is_head_of_md = false}

My read is that rpc_bdev_malloc_create is driven by the SPDK_POLLER_REGISTER(rpc_client_connect_poller, ctx, 100); set up in spdk_subsystem_init_from_json_config: the config loader connects as a JSON-RPC client to the application's own temporary RPC socket and replays each "method" entry from the file as an RPC request, and the server-side pollers visible in the backtrace (rpc_subsystem_poll -> spdk_rpc_accept) dispatch that request to rpc_bdev_malloc_create.

Readers who are interested can dig further into the create_malloc_disk function called from rpc_bdev_malloc_create.

The example functions

The logic in hello_bdev.c is fairly straightforward; the execution order is:

spdk_app_start
hello_start
hello_write
write_complete
hello_read
read_complete

This is similar to what spdk/examples/nvme/hello_world/hello_world.c does.

hello_start first obtains a block device descriptor via spdk_bdev_open_ext, then gets the bdev and an I/O channel, allocates a buffer, fills it with "Hello World!\n", and calls spdk_bdev_write to write the buffer to the Malloc0 device at offset 0. Once the write completes it zeroes the buffer and calls spdk_bdev_read on the same location; once the read completes it prints the data that came back and releases the previously acquired I/O channel and block device descriptor.
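
A condensed sketch of that flow (simplified from hello_start in hello_bdev.c; error handling, logging, and the event callback body are omitted):

static void
hello_start(void *arg1)
{
    struct hello_context_t *hello_context = arg1;
    uint32_t blk_size;

    /* Open the bdev by name; this yields the block device descriptor. */
    spdk_bdev_open_ext(hello_context->bdev_name, true, hello_bdev_event_cb,
                       NULL, &hello_context->bdev_desc);
    hello_context->bdev = spdk_bdev_desc_get_bdev(hello_context->bdev_desc);

    /* Per-thread I/O channel used to submit requests. */
    hello_context->bdev_io_channel = spdk_bdev_get_io_channel(hello_context->bdev_desc);

    /* One block worth of DMA-able, properly aligned buffer. */
    blk_size = spdk_bdev_get_block_size(hello_context->bdev);
    hello_context->buff = spdk_dma_zmalloc(blk_size,
                                           spdk_bdev_get_buf_align(hello_context->bdev), NULL);
    snprintf(hello_context->buff, blk_size, "%s", "Hello World!\n");

    hello_write(hello_context);
}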

A brief analysis of the hello_write call path:

// Call stack
(gdb) bt
#0 _sw_accel_copy_iovs (dst_iovs=0x555555cca0b8, dst_iovcnt=1, src_iovs=0x555555cca0a8, src_iovcnt=1) at accel_sw.c:115
#1 0x0000555555696577 in sw_accel_submit_tasks (ch=0x555555dadfd0, accel_task=0x555555cc9fb0) at accel_sw.c:455
#2 0x000055555568e5a2 in accel_submit_task (accel_ch=0x555555e51190, task=0x555555cc9fb0) at accel.c:305
#3 0x000055555568e723 in spdk_accel_submit_copy (ch=0x555555e51130, dst=0x200016600000, src=0x2000162efd00, nbytes=512, flags=0, cb_fn=0x55555556e83f <malloc_done>, cb_arg=0x200010aa2ae0) at accel.c:340
#4 0x000055555556eec4 in bdev_malloc_writev (mdisk=0x555555cc95c0, ch=0x555555e51130, task=0x200010aa2ae0, iov=0x200010aa2710, iovcnt=1, len=512, offset=0, md_buf=0x0, md_len=0, md_offset=0) at bdev_malloc.c:277
#5 0x000055555556f43b in _bdev_malloc_submit_request (mch=0x555555e50e60, bdev_io=0x200010aa2700) at bdev_malloc.c:382
#6 0x000055555556f69c in bdev_malloc_submit_request (ch=0x555555e50e00, bdev_io=0x200010aa2700) at bdev_malloc.c:457
#7 0x0000555555674c66 in bdev_submit_request (bdev=0x555555cc95c0, ioch=0x555555e50e00, bdev_io=0x200010aa2700) at bdev.c:1297
#8 0x000055555567784d in bdev_io_do_submit (bdev_ch=0x555555e50d50, bdev_io=0x200010aa2700) at bdev.c:2477
#9 0x000055555567947a in _bdev_io_submit (ctx=0x200010aa2700) at bdev.c:3173
#10 0x0000555555679a48 in bdev_io_submit (bdev_io=0x200010aa2700) at bdev.c:3293
#11 0x000055555567e0f7 in bdev_write_blocks_with_md (desc=0x555555e50b60, ch=0x555555e50cf0, buf=0x2000162efd00, md_buf=0x0, offset_blocks=0, num_blocks=1, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5195
#12 0x000055555567e1df in spdk_bdev_write_blocks (desc=0x555555e50b60, ch=0x555555e50cf0, buf=0x2000162efd00, offset_blocks=0, num_blocks=1, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5219
#13 0x000055555567e188 in spdk_bdev_write (desc=0x555555e50b60, ch=0x555555e50cf0, buf=0x2000162efd00, offset=0, nbytes=512, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5211
#14 0x000055555556decc in hello_write (arg=0x7fffffffde40) at hello_bdev.c:139
#15 0x000055555556e4d3 in hello_start (arg1=0x7fffffffde40) at hello_bdev.c:276
#16 0x00005555556683f7 in app_start_application () at app.c:264
#17 0x0000555555668478 in app_start_rpc (rc=0, arg1=0x0) at app.c:285
#18 0x000055555569f259 in app_json_config_load_done (ctx=0x555555c9f000, rc=0) at json_config.c:111
#19 0x000055555569ffa6 in app_json_config_load_subsystem (_ctx=0x555555c9f000) at json_config.c:473
#20 0x00005555556a7bd0 in msg_queue_run_batch (thread=0x555555c9ec00, max_msgs=8) at thread.c:804
#21 0x00005555556a8528 in thread_poll (thread=0x555555c9ec00, max_msgs=0, now=121496004745246) at thread.c:1026
#22 0x00005555556a8837 in spdk_thread_poll (thread=0x555555c9ec00, max_msgs=0, now=121496004745246) at thread.c:1119
#23 0x000055555566d309 in _reactor_run (reactor=0x555555c7b780) at reactor.c:914
#24 0x000055555566d3fb in reactor_run (arg=0x555555c7b780) at reactor.c:952
#25 0x000055555566d887 in spdk_reactors_start () at reactor.c:1068
#26 0x0000555555669c5d in spdk_app_start (opts_user=0x7fffffffdea0, start_fn=0x55555556e1fb <hello_start>, arg1=0x7fffffffde40) at app.c:779
#27 0x000055555556e5d9 in main (argc=5, argv=0x7fffffffe078) at hello_bdev.c:306

Tracing it all the way down, the write is ultimately a memcpy. So what are src and dst?

static void
_sw_accel_copy_iovs(struct iovec *dst_iovs, uint32_t dst_iovcnt,
                    struct iovec *src_iovs, uint32_t src_iovcnt)
{
    struct spdk_ioviter iter;
    void *src, *dst;
    size_t len;

    for (len = spdk_ioviter_first(&iter, src_iovs, src_iovcnt,
                                  dst_iovs, dst_iovcnt, &src, &dst);
         len != 0;
         len = spdk_ioviter_next(&iter, &src, &dst)) {
        memcpy(dst, src, len);
    }
}

src is hello_context->buff and dst is mdisk->malloc_buf + offset: writing to a Malloc bdev simply copies the data into the bdev's backing buffer, with no SQ/CQ operations anywhere in sight. The gdb session below confirms this, and a sketch of the submitting function follows it.

(gdb) p src
$11 = (void *) 0x2000162efd00
(gdb) p dst
$12 = (void *) 0x200016600000
(gdb) p len
$13 = 512
(gdb) f 4
#4 0x000055555556eec4 in bdev_malloc_writev (mdisk=0x555555cc95c0, ch=0x555555e51130, task=0x200010aa2ae0, iov=0x200010aa2710, iovcnt=1, len=512, offset=0, md_buf=0x0, md_len=0, md_offset=0) at bdev_malloc.c:277
277 res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base,
(gdb) p mdisk->malloc_buf + offset
$14 = (void *) 0x200016600000
(gdb) f 13
#13 0x000055555567e188 in spdk_bdev_write (desc=0x555555e50b60, ch=0x555555e50cf0, buf=0x2000162efd00, offset=0, nbytes=512, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5211
5211 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
(gdb) p buf
$15 = (void *) 0x2000162efd00
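
For context, here is a condensed sketch of bdev_malloc_writev (simplified from bdev_malloc.c; the metadata parameters and error handling are dropped). It shows where that dst comes from: each iov element is handed to the accel framework, whose software path ends in the memcpy above.

static int
bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
                   struct malloc_task *task,
                   struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
{
    void *dst = mdisk->malloc_buf + offset;
    int i, res;

    for (i = 0; i < iovcnt; i++) {
        /* Software accel module: ends up in _sw_accel_copy_iovs/memcpy. */
        res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base,
                                     iov[i].iov_len, 0, malloc_done, task);
        if (res != 0) {
            return res;
        }
        dst += iov[i].iov_len;
    }

    return 0;
}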

The function in this call stack that deserves particular attention is bdev_write_blocks_with_md: it is where the spdk_bdev_io structure is created. Whenever an I/O request completes, spdk_bdev_free_io must be called to release it.

static int
bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                          void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
                          spdk_bdev_io_completion_cb cb, void *cb_arg)
{
    struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
    struct spdk_bdev_io *bdev_io;
    struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);

    if (!desc->write) {
        return -EBADF;
    }

    if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
        return -EINVAL;
    }

    bdev_io = bdev_channel_get_io(channel);
    if (!bdev_io) {
        return -ENOMEM;
    }

    /* Fill in the I/O request */
    bdev_io->internal.ch = channel;
    bdev_io->internal.desc = desc;
    bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
    bdev_io->u.bdev.iovs = &bdev_io->iov;
    bdev_io->u.bdev.iovs[0].iov_base = buf;
    bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
    bdev_io->u.bdev.iovcnt = 1;
    bdev_io->u.bdev.md_buf = md_buf;
    bdev_io->u.bdev.num_blocks = num_blocks;
    bdev_io->u.bdev.offset_blocks = offset_blocks;
    bdev_io->u.bdev.memory_domain = NULL;
    bdev_io->u.bdev.memory_domain_ctx = NULL;
    bdev_io->u.bdev.accel_sequence = NULL;
    bdev_io_init(bdev_io, bdev, cb_arg, cb); /* set the completion callback */

    bdev_io_submit(bdev_io);
    return 0;
}
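
On the completion side, every spdk_bdev_io is handed back with spdk_bdev_free_io. A typical completion callback, modeled on write_complete in hello_bdev.c (simplified):

static void
write_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
    struct hello_context_t *hello_context = cb_arg;

    /* Release the spdk_bdev_io created in bdev_write_blocks_with_md. */
    spdk_bdev_free_io(bdev_io);

    if (!success) {
        SPDK_ERRLOG("bdev io write error\n");
        spdk_app_stop(-1);
        return;
    }

    /* Read back the block that was just written. */
    hello_read(hello_context);
}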

Several hops in the call chain go through function pointers; the key one is bdev_submit_request -> bdev_malloc_submit_request.

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
                    struct spdk_bdev_io *bdev_io)
{
    /* After a request is submitted to a bdev module, the ownership of an accel sequence
     * associated with that bdev_io is transferred to the bdev module. So, clear the internal
     * sequence pointer to make sure we won't touch it anymore. */
    if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
         bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
        assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
        bdev_io->internal.accel_sequence = NULL;
    }

    bdev->fn_table->submit_request(ioch, bdev_io);
}

The spdk_bdev_fn_table structure is defined as:

/**
 * Function table for a block device backend.
 *
 * The backend block device function table provides a set of APIs to allow
 * communication with a backend. The main commands are read/write API
 * calls for I/O via submit_request.
 */
struct spdk_bdev_fn_table {
    /** Destroy the backend block device object */
    int (*destruct)(void *ctx);

    /** Process the IO. */
    void (*submit_request)(struct spdk_io_channel *ch, struct spdk_bdev_io *);

    /** Check if the block device supports a specific I/O type. */
    bool (*io_type_supported)(void *ctx, enum spdk_bdev_io_type);

    /** Get an I/O channel for the specific bdev for the calling thread. */
    struct spdk_io_channel *(*get_io_channel)(void *ctx);

    /**
     * Output driver-specific information to a JSON stream. Optional - may be NULL.
     *
     * The JSON write context will be initialized with an open object, so the bdev
     * driver should write a name (based on the driver name) followed by a JSON value
     * (most likely another nested object).
     */
    int (*dump_info_json)(void *ctx, struct spdk_json_write_ctx *w);

    /**
     * Output bdev-specific RPC configuration to a JSON stream. Optional - may be NULL.
     *
     * This function should only be implemented for bdevs which can be configured
     * independently of other bdevs. For example, RPCs to create a bdev for an NVMe
     * namespace may not be generated by this function, since enumerating an NVMe
     * namespace requires attaching to an NVMe controller, and that controller may
     * contain multiple namespaces. The spdk_bdev_module's config_json function should
     * be used instead for these cases.
     *
     * The JSON write context will be initialized with an open object, so the bdev
     * driver should write all data necessary to recreate this bdev by invoking
     * constructor method. No other data should be written.
     */
    void (*write_config_json)(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w);

    /** Get spin-time per I/O channel in microseconds.
     * Optional - may be NULL.
     */
    uint64_t (*get_spin_time)(struct spdk_io_channel *ch);

    /** Get bdev module context. */
    void *(*get_module_ctx)(void *ctx);

    /** Get memory domains used by bdev. Optional - may be NULL.
     * Vbdev module implementation should call \ref spdk_bdev_get_memory_domains for underlying bdev.
     * Vbdev module must inspect types of memory domains returned by base bdev and report only those
     * memory domains that it can work with. */
    int (*get_memory_domains)(void *ctx, struct spdk_memory_domain **domains, int array_size);

    /**
     * Reset I/O statistics specific for this bdev context.
     */
    void (*reset_device_stat)(void *ctx);

    /**
     * Dump I/O statistics specific for this bdev context.
     */
    void (*dump_device_stat_json)(void *ctx, struct spdk_json_write_ctx *w);

    /** Check if bdev can handle spdk_accel_sequence to handle I/O of specific type. */
    bool (*accel_sequence_supported)(void *ctx, enum spdk_bdev_io_type type);
};

rpc_bdev_malloc_create, invoked while the JSON config is loaded, calls create_malloc_disk, which is where this information gets filled in:

struct malloc_disk {
    struct spdk_bdev disk;
    void *malloc_buf;
    void *malloc_md_buf;
    TAILQ_ENTRY(malloc_disk) link;
};

static const struct spdk_bdev_fn_table malloc_fn_table = {
    .destruct = bdev_malloc_destruct,
    .submit_request = bdev_malloc_submit_request,
    .io_type_supported = bdev_malloc_io_type_supported,
    .get_io_channel = bdev_malloc_get_io_channel,
    .write_config_json = bdev_malloc_write_json_config,
};

static struct spdk_bdev_module malloc_if = {
    .name = "malloc",
    .module_init = bdev_malloc_initialize,
    .module_fini = bdev_malloc_deinitialize,
    .get_ctx_size = bdev_malloc_get_ctx_size,
};

int
create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
{
    /* ... most of the function omitted ... */

    /*
     * Allocate the large backend memory buffer from pinned memory.
     *
     * TODO: need to pass a hint so we know which socket to allocate
     * from on multi-socket systems.
     */
    mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL,
                                     SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);

    mdisk->disk.max_copy = 0;
    mdisk->disk.ctxt = mdisk;
    mdisk->disk.fn_table = &malloc_fn_table;
    mdisk->disk.module = &malloc_if;

    rc = spdk_bdev_register(&mdisk->disk);

    TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link);
}

The SPDK documentation has an introduction to writing your own block device: Writing a Custom Block Device Module.

The SPDK bdev user guide: Block Device User Guide.
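
Following that doc, the minimum a custom bdev module has to provide is a fn_table plus registration. A skeleton sketch (hypothetical names, not a complete module — channel registration and module plumbing are omitted):

static int
mybdev_destruct(void *ctx)
{
    return 0;
}

static void
mybdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
    /* A real module would perform the I/O here before completing it. */
    spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
}

static bool
mybdev_io_type_supported(void *ctx, enum spdk_bdev_io_type type)
{
    return type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE;
}

static struct spdk_io_channel *
mybdev_get_io_channel(void *ctx)
{
    return spdk_get_io_channel(ctx);
}

static const struct spdk_bdev_fn_table mybdev_fn_table = {
    .destruct = mybdev_destruct,
    .submit_request = mybdev_submit_request,
    .io_type_supported = mybdev_io_type_supported,
    .get_io_channel = mybdev_get_io_channel,
};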


No persistence is ever done on the malloc_buf allocated by the malloc bdev, so malloc bdev data lives only in memory.

nvme bdev

At first I had no idea how to generate a JSON config file analogous to the Malloc one; while integrating RocksDB I came across the gen_nvme.sh script.

scripts/gen_nvme.sh --json-with-subsystems
# output
{
  "subsystems": [
    {
      "subsystem": "bdev",
      "config": [
        {
          "method": "bdev_nvme_attach_controller",
          "params": {
            "trtype": "PCIe",
            "name": "Nvme0",
            "traddr": "0000:03:00.0"
          }
        },
        {
          "method": "bdev_nvme_attach_controller",
          "params": {
            "trtype": "PCIe",
            "name": "Nvme1",
            "traddr": "0000:0b:00.0"
          }
        },
        {
          "method": "bdev_nvme_attach_controller",
          "params": {
            "trtype": "PCIe",
            "name": "Nvme2",
            "traddr": "0000:13:00.0"
          }
        }
      ]
    }
  ]
}

Following the same pattern, I ran the command below, but it failed:

./build/examples/hello_bdev -c ./examples/bdev/hello_world/nvme.json -b Nvme0
[2023-06-23 10:19:01.096287] Starting SPDK v23.05-pre git sha1 ee06693c3 / DPDK 22.11.1 initialization...
[2023-06-23 10:19:01.096442] [ DPDK EAL parameters: hello_bdev --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid6092 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-06-23 10:19:01.215133] app.c: 738:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-06-23 10:19:01.267987] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-06-23 10:19:01.311606] accel_sw.c: 601:sw_accel_module_init: *NOTICE*: Accel framework software module initialized.
[2023-06-23 10:19:01.476591] hello_bdev.c: 222:hello_start: *NOTICE*: Successfully started the application
[2023-06-23 10:19:01.476916] hello_bdev.c: 231:hello_start: *NOTICE*: Opening the bdev Nvme0
[2023-06-23 10:19:01.477001] bdev.c:7681:spdk_bdev_open_ext: *NOTICE*: Currently unable to find bdev with name: Nvme0
[2023-06-23 10:19:01.477110] hello_bdev.c: 235:hello_start: *ERROR*: Could not open bdev: Nvme0
[2023-06-23 10:19:01.477180] app.c: 844:spdk_app_stop: *WARNING*: spdk_app_stop'd on non-zero
[2023-06-23 10:19:01.553524] hello_bdev.c: 308:main: *ERROR*: ERROR starting application

Checking rpc_bdev_nvme_attach_controller, the name that shows up really is Nvme0. A global search for -b Nvme0 turns up the following:

spdk/doc/bdev.md
There are two ways to create block device based on NVMe device in SPDK. First way is to connect local PCIe drive and second one is to connect NVMe-oF device. In both cases user should use bdev_nvme_attach_controller RPC command to achieve that.

Example commands
rpc.py bdev_nvme_attach_controller -b NVMe1 -t PCIe -a 0000:01:00.0
This command will create NVMe bdev of physical device in the system.
rpc.py bdev_nvme_attach_controller -b Nvme0 -t RDMA -a 192.168.100.1 -f IPv4 -s 4420 -n nqn.2016-06.io.spdk:cnode1
This command will create NVMe bdev of NVMe-oF resource.

To remove an NVMe controller use the bdev_nvme_detach_controller command.
rpc.py bdev_nvme_detach_controller Nvme0
This command will remove NVMe bdev named Nvme0.

spdk/scripts/vagrant/README.md

$ sudo scripts/setup.sh
$ sudo scripts/gen_nvme.sh --json-with-subsystems > ./build/examples/hello_bdev.json
$ sudo ./build/examples/hello_bdev --json ./build/examples/hello_bdev.json -b Nvme0n1

So I tried Nvme0n1 instead, and it worked:

./build/examples/hello_bdev -c examples/bdev/hello_world/nvme.json -b Nvme0n1
[2023-06-23 11:15:10.141844] Starting SPDK v23.05-pre git sha1 ee06693c3 / DPDK 22.11.1 initialization...
[2023-06-23 11:15:10.141985] [ DPDK EAL parameters: hello_bdev --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid12910 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-06-23 11:15:10.265816] app.c: 738:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-06-23 11:15:10.320068] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-06-23 11:15:10.373112] accel_sw.c: 601:sw_accel_module_init: *NOTICE*: Accel framework software module initialized.
[2023-06-23 11:15:10.542735] hello_bdev.c: 222:hello_start: *NOTICE*: Successfully started the application
[2023-06-23 11:15:10.543367] hello_bdev.c: 231:hello_start: *NOTICE*: Opening the bdev Nvme0n1
[2023-06-23 11:15:10.543648] hello_bdev.c: 244:hello_start: *NOTICE*: Opening io channel
[2023-06-23 11:15:10.544289] hello_bdev.c: 138:hello_write: *NOTICE*: Writing to the bdev
[2023-06-23 11:15:10.545329] hello_bdev.c: 117:write_complete: *NOTICE*: bdev io write completed successfully
[2023-06-23 11:15:10.546495] hello_bdev.c: 84:hello_read: *NOTICE*: Reading io
[2023-06-23 11:15:10.546975] hello_bdev.c: 65:read_complete: *NOTICE*: Read string from bdev : Hello World!

[2023-06-23 11:15:10.548061] hello_bdev.c: 74:read_complete: *NOTICE*: Stopping app

A quick look at what differs for the NVMe bdev, going straight to the branch point, bdev_submit_request:

Thread 1 "reactor_0" hit Breakpoint 1, bdev_submit_request (bdev=0x555555cd8350, ioch=0x555555cca200, bdev_io=0x200010aa2700) at bdev.c:1291
1291 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
(gdb) bt
#0 bdev_submit_request (bdev=0x555555cd8350, ioch=0x555555cca200, bdev_io=0x200010aa2700) at bdev.c:1291
#1 0x000055555567784d in bdev_io_do_submit (bdev_ch=0x555555cca150, bdev_io=0x200010aa2700) at bdev.c:2477
#2 0x000055555567947a in _bdev_io_submit (ctx=0x200010aa2700) at bdev.c:3173
#3 0x0000555555679a48 in bdev_io_submit (bdev_io=0x200010aa2700) at bdev.c:3293
#4 0x000055555567e0f7 in bdev_write_blocks_with_md (desc=0x555555e600d0, ch=0x555555cca0f0, buf=0x200003aeb340, md_buf=0x0, offset_blocks=0, num_blocks=1, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5195
#5 0x000055555567e1df in spdk_bdev_write_blocks (desc=0x555555e600d0, ch=0x555555cca0f0, buf=0x200003aeb340, offset_blocks=0, num_blocks=1, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5219
#6 0x000055555567e188 in spdk_bdev_write (desc=0x555555e600d0, ch=0x555555cca0f0, buf=0x200003aeb340, offset=0, nbytes=512, cb=0x55555556dd5e <write_complete>, cb_arg=0x7fffffffde40) at bdev.c:5211
#7 0x000055555556decc in hello_write (arg=0x7fffffffde40) at hello_bdev.c:139
#8 0x000055555556e4d3 in hello_start (arg1=0x7fffffffde40) at hello_bdev.c:276
#9 0x00005555556683f7 in app_start_application () at app.c:264
#10 0x0000555555668478 in app_start_rpc (rc=0, arg1=0x0) at app.c:285
#11 0x000055555569f259 in app_json_config_load_done (ctx=0x555555c9f000, rc=0) at json_config.c:111
#12 0x000055555569ffa6 in app_json_config_load_subsystem (_ctx=0x555555c9f000) at json_config.c:473
#13 0x00005555556a7bd0 in msg_queue_run_batch (thread=0x555555c9ec00, max_msgs=8) at thread.c:804
#14 0x00005555556a8528 in thread_poll (thread=0x555555c9ec00, max_msgs=0, now=9579853985352) at thread.c:1026
#15 0x00005555556a8837 in spdk_thread_poll (thread=0x555555c9ec00, max_msgs=0, now=9579853985352) at thread.c:1119
#16 0x000055555566d309 in _reactor_run (reactor=0x555555c7b780) at reactor.c:914
#17 0x000055555566d3fb in reactor_run (arg=0x555555c7b780) at reactor.c:952
#18 0x000055555566d887 in spdk_reactors_start () at reactor.c:1068
#19 0x0000555555669c5d in spdk_app_start (opts_user=0x7fffffffdea0, start_fn=0x55555556e1fb <hello_start>, arg1=0x7fffffffde40) at app.c:779
#20 0x000055555556e5d9 in main (argc=5, argv=0x7fffffffe078) at hello_bdev.c:306
(gdb) n
1292 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
(gdb) n
1297 bdev->fn_table->submit_request(ioch, bdev_io);
(gdb) p bdev->fn_table
$1 = (const struct spdk_bdev_fn_table *) 0x555555a10c80 <nvmelib_fn_table>

static const struct spdk_bdev_fn_table nvmelib_fn_table = {
    .destruct = bdev_nvme_destruct,
    .submit_request = bdev_nvme_submit_request,
    .io_type_supported = bdev_nvme_io_type_supported,
    .get_io_channel = bdev_nvme_get_io_channel,
    .dump_info_json = bdev_nvme_dump_info_json,
    .write_config_json = bdev_nvme_write_config_json,
    .get_spin_time = bdev_nvme_get_spin_time,
    .get_module_ctx = bdev_nvme_get_module_ctx,
    .get_memory_domains = bdev_nvme_get_memory_domains,
    .reset_device_stat = bdev_nvme_reset_device_stat,
    .dump_device_stat_json = bdev_nvme_dump_device_stat_json,
};

static struct spdk_bdev_module nvme_if = {
    .name = "nvme",
    .async_fini = true,
    .module_init = bdev_nvme_library_init,
    .module_fini = bdev_nvme_library_fini,
    .config_json = bdev_nvme_config_json,
    .get_ctx_size = bdev_nvme_get_ctx_size,
};

// most of the code omitted
static int
nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
                 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
                 uint32_t prchk_flags, void *ctx)
{
    disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
    disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
    disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
    disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr);
    disk->ctxt = ctx;
    disk->fn_table = &nvmelib_fn_table;
    disk->module = &nvme_if;

    return 0;
}

Thread 1 "reactor_0" hit Breakpoint 2, nvme_disk_create (disk=0x770000007c, base_name=0x0, ctrlr=0x0, ns=0x1, prchk_flags=57, ctx=0x9) at bdev_nvme.c:3274
3274 {
(gdb) bt
#0 nvme_disk_create (disk=0x770000007c, base_name=0x0, ctrlr=0x0, ns=0x1, prchk_flags=57, ctx=0x9) at bdev_nvme.c:3274
#1 0x000055555557ab49 in nvme_bdev_create (nvme_ctrlr=0x555555cd7b10, nvme_ns=0x555555cd82d0) at bdev_nvme.c:3429
#2 0x000055555557b883 in nvme_ctrlr_populate_namespace (nvme_ctrlr=0x555555cd7b10, nvme_ns=0x555555cd82d0) at bdev_nvme.c:3752
#3 0x000055555557bdac in nvme_ctrlr_populate_namespaces (nvme_ctrlr=0x555555cd7b10, ctx=0x555555cc9f50) at bdev_nvme.c:3911
#4 0x000055555557cdcf in nvme_ctrlr_create_done (nvme_ctrlr=0x555555cd7b10, ctx=0x555555cc9f50) at bdev_nvme.c:4387
#5 0x000055555557d7cd in nvme_ctrlr_create (ctrlr=0x2000162ec0c0, name=0x555555cc0650 "Nvme0", trid=0x555555cc9f78, ctx=0x555555cc9f50) at bdev_nvme.c:4628
#6 0x000055555557e779 in connect_attach_cb (cb_ctx=0x555555cca1c0, trid=0x2000162ec0e8, ctrlr=0x2000162ec0c0, opts=0x2000162ed6c8) at bdev_nvme.c:5054
#7 0x00005555555f5271 in nvme_ctrlr_poll_internal (ctrlr=0x2000162ec0c0, probe_ctx=0x555555cca520) at nvme.c:737
#8 0x00005555555f743a in spdk_nvme_probe_poll_async (probe_ctx=0x555555cca520) at nvme.c:1510
#9 0x000055555557e856 in bdev_nvme_async_poll (arg=0x555555cc9f50) at bdev_nvme.c:5089
#10 0x00005555556a82fd in thread_execute_timed_poller (thread=0x555555c9ec00, poller=0x555555cd7940, now=13419228290350) at thread.c:970
#11 0x00005555556a8613 in thread_poll (thread=0x555555c9ec00, max_msgs=0, now=13419228290350) at thread.c:1060
#12 0x00005555556a8837 in spdk_thread_poll (thread=0x555555c9ec00, max_msgs=0, now=13419228290350) at thread.c:1119
#13 0x000055555566d309 in _reactor_run (reactor=0x555555c7b780) at reactor.c:914
#14 0x000055555566d3fb in reactor_run (arg=0x555555c7b780) at reactor.c:952
--Type <RET> for more, q to quit, c to continue without paging--
#15 0x000055555566d887 in spdk_reactors_start () at reactor.c:1068
#16 0x0000555555669c5d in spdk_app_start (opts_user=0x7fffffffdea0, start_fn=0x55555556e1fb <hello_start>, arg1=0x7fffffffde40) at app.c:779
#17 0x000055555556e5d9 in main (argc=5, argv=0x7fffffffe078) at hello_bdev.c:306

// why -b Nvme0n1 works: disk->name = base_name ("Nvme0") + "n" + namespace id (1)
(gdb) p disk->name
$2 = 0x555555c7a0f0 "Nvme0n1"
 ./build/examples/hello_bdev -c examples/bdev/hello_world/nvme.json -b Nvme0n1 -L all
[2023-06-23 11:54:08.054234] json_config.c: 383:app_json_config_load_subsystem_config_entry: *DEBUG*: params: {
"trtype": "PCIe",
"name": "Nvme0",
"traddr": "0000:03:00.0"
}

[2023-06-23 11:54:08.111049] jsonrpc_client.c: 67:jsonrpc_parse_response: *DEBUG*: JSON string is :
{"jsonrpc":"2.0","id":0,"result":["Nvme0n1"]}

[2023-06-23 11:54:08.111082] json_config.c: 383:app_json_config_load_subsystem_config_entry: *DEBUG*: params: {
"trtype": "PCIe",
"name": "Nvme1",
"traddr": "0000:0b:00.0"
}
{"jsonrpc":"2.0","id":0,"result":[]}
[2023-06-23 11:54:08.160526] json_config.c: 383:app_json_config_load_subsystem_config_entry: *DEBUG*: params: {
"trtype": "PCIe",
"name": "Nvme2",
"traddr": "0000:13:00.0"
}
{"jsonrpc":"2.0","id":0,"result":[]}

[2023-06-23 11:54:08.211102] hello_bdev.c: 138:hello_write: *NOTICE*: Writing to the bdev
[2023-06-23 11:54:08.211328] bdev_nvme.c:6613:bdev_nvme_writev: *DEBUG*: write 1 blocks with offset 0
[2023-06-23 11:54:08.211363] nvme_qpair.c: 474:spdk_nvme_print_completion: *NOTICE*: SUCCESS (00/00) qid:0 cid:186 cdw0:0 sqhd:000d p:1 m:0 dnr:0
[2023-06-23 11:54:08.211496] nvme_qpair.c: 223:nvme_admin_qpair_print_command: *NOTICE*: CREATE IO SQ (01) qid:0 cid:191 nsid:0 cdw10:00ff0001 cdw11:00010001 PRP1 0x18a608000 PRP2 0x0
[2023-06-23 11:54:08.213185] nvme_qpair.c: 474:spdk_nvme_print_completion: *NOTICE*: SUCCESS (00/00) qid:0 cid:191 cdw0:0 sqhd:000e p:1 m:0 dnr:0
[2023-06-23 11:54:08.213517] nvme_pcie_common.c:1207:nvme_pcie_prp_list_append: *DEBUG*: prp_index:0 virt_addr:0x200003aec0c0 len:512
[2023-06-23 11:54:08.213555] nvme_pcie_common.c:1235:nvme_pcie_prp_list_append: *DEBUG*: prp1 = 0x190cec0c0
[2023-06-23 11:54:08.213573] nvme_qpair.c: 243:nvme_io_qpair_print_command: *NOTICE*: WRITE sqid:1 cid:191 nsid:1 lba:0 len:1 PRP1 0x190cec0c0 PRP2 0x0
[2023-06-23 11:54:08.213836] nvme_qpair.c: 474:spdk_nvme_print_completion: *NOTICE*: SUCCESS (00/00) qid:1 cid:191 cdw0:0 sqhd:0001 p:1 m:0 dnr:0
[2023-06-23 11:54:08.213967] hello_bdev.c: 117:write_complete: *NOTICE*: bdev io write completed successfully
[2023-06-23 11:54:08.214070] hello_bdev.c: 84:hello_read: *NOTICE*: Reading io
[2023-06-23 11:54:08.214150] bdev_nvme.c:6567:bdev_nvme_readv: *DEBUG*: read 1 blocks with offset 0
[2023-06-23 11:54:08.214177] nvme_pcie_common.c:1207:nvme_pcie_prp_list_append: *DEBUG*: prp_index:0 virt_addr:0x200003aec0c0 len:512
[2023-06-23 11:54:08.214187] nvme_pcie_common.c:1235:nvme_pcie_prp_list_append: *DEBUG*: prp1 = 0x190cec0c0
[2023-06-23 11:54:08.214200] nvme_qpair.c: 243:nvme_io_qpair_print_command: *NOTICE*: READ sqid:1 cid:190 nsid:1 lba:0 len:1 PRP1 0x190cec0c0 PRP2 0x0
[2023-06-23 11:54:08.214363] nvme_qpair.c: 474:spdk_nvme_print_completion: *NOTICE*: SUCCESS (00/00) qid:1 cid:190 cdw0:0 sqhd:0002 p:1 m:0 dnr:0
[2023-06-23 11:54:08.214492] hello_bdev.c: 65:read_complete: *NOTICE*: Read string from bdev : Hello World!

Excerpts from the documentation

Why SPDK polls instead of taking interrupts:
1) Most hardware designs do not support user-space interrupt delivery.
2) Interrupts force context switches, which are relatively expensive. Polling only needs to watch host memory, not MMIO, to see whether the relevant bit has flipped, and technologies such as Intel DDIO can keep that piece of host memory resident in the CPU cache.

// nvme driver from kernel 4.19
/*
 * An NVM Express queue. Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
    struct device *q_dmadev;
    struct nvme_dev *dev;
    spinlock_t sq_lock;
    struct nvme_command *sq_cmds;            // SQ memory (virtual) address
    struct nvme_command __iomem *sq_cmds_io; // SQ I/O address when the CMB is used
    spinlock_t cq_lock ____cacheline_aligned_in_smp;
    volatile struct nvme_completion *cqes;   // CQ memory address
    struct blk_mq_tags **tags;
    dma_addr_t sq_dma_addr;                  // SQ bus (DMA) address
    dma_addr_t cq_dma_addr;                  // CQ bus (DMA) address
    u32 __iomem *q_db;                       // doorbell register I/O address
    u16 q_depth;
    s16 cq_vector;
    u16 sq_tail;                             // the two doorbell values the host writes (SQ tail / CQ head)
    u16 cq_head;
    u16 last_cq_head;
    u16 qid;
    u8 cq_phase;
    u32 *dbbuf_sq_db;
    u32 *dbbuf_cq_db;
    u32 *dbbuf_sq_ei;
    u32 *dbbuf_cq_ei;
};


Reference: 可乐学习NVMe之二:三只熊SQ/CQ/DB (a Chinese-language introduction to NVMe SQ/CQ/doorbells)

// SPDK's CQ polling loop (abridged). The phase bit (status.p) in each
// completion entry flips every time the queue wraps, so an entry whose
// phase does not match the expected value has not been written yet:
// detecting new completions costs only a host-memory read, no MMIO.
int32_t
nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
{
    while (1) {
        cpl = &pqpair->cpl[pqpair->cq_head];

        if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
            break;
        }

        if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
            next_cq_head = pqpair->cq_head + 1;
            next_phase = pqpair->flags.phase;
        } else {
            next_cq_head = 0;
            next_phase = !pqpair->flags.phase;
        }
        next_cpl = &pqpair->cpl[next_cq_head];
        next_is_valid = (next_cpl->status.p == next_phase);
        if (next_is_valid) {
            __builtin_prefetch(&pqpair->tr[next_cpl->cid]);
        }

        tr = &pqpair->tr[cpl->cid];
        pqpair->sq_head = cpl->sqhd;
        __builtin_prefetch(&tr->req->stailq);
        nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
        if (++num_completions == max_completions) {
            break;
        }
    }
}

The SPDK driver chooses to expose the hardware queues directly to the application and requires that a given hardware queue only ever be accessed from one thread at a time. In practice, an application assigns one hardware queue per thread (rather than one per core as in kernel drivers). This guarantees that a thread can submit requests without any kind of coordination (i.e. locking) with the other threads in the system.
— SPDK (Storage Performance Development Kit) official documentation, Chinese translation

Excerpts from SDC (Storage Developer Conference)


Educational Library

SPDK NVMe: An In-depth Look at its Architecture and Design

A rough mapping from the slides to the SPDK code:

// env.h
/**
* Enumerate all PCI devices supported by the provided driver and try to
* attach those that weren't attached yet. The provided callback will be
* called for each such device and its return code will decide whether that
* device is attached or not. Attached devices have to be manually detached
* with spdk_pci_device_detach() to be attach-able again.
*
* During enumeration all registered pci devices with exposed access to
* userspace are getting probed internally unless not explicitly specified
* on denylist. Because of that it becomes not possible to either use such
* devices with another application or unbind the driver (e.g. vfio).
*
* 2s asynchronous delay is introduced to avoid race conditions between
* user space software initialization and in-kernel device handling for
* newly inserted devices. Subsequent enumerate call after the delay
* shall allow for a successful device attachment.
*
* \param driver Driver for a specific device type.
* \param enum_cb Callback to be called for each non-attached PCI device.
* \param enum_ctx Additional context passed to the callback function.
*
* \return -1 if an internal error occurred or the provided callback returned -1,
* 0 otherwise
*/
int spdk_pci_enumerate(struct spdk_pci_driver *driver, spdk_pci_enum_cb enum_cb, void *enum_ctx);

/**
* Allocate dma/sharable memory based on a given dma_flg. It is a memory buffer
* with the given size, alignment and socket id.
*
* \param size Size in bytes.
* \param align If non-zero, the allocated buffer is aligned to a multiple of
* align. In this case, it must be a power of two. The returned buffer is always
* aligned to at least cache line size.
* \param phys_addr **Deprecated**. Please use spdk_vtophys() for retrieving physical
* addresses. A pointer to the variable to hold the physical address of
* the allocated buffer is passed. If NULL, the physical address is not returned.
* \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY
* for any socket.
* \param flags Combination of SPDK_MALLOC flags (\ref SPDK_MALLOC_DMA, \ref SPDK_MALLOC_SHARE).
* At least one flag must be specified.
*
* \return a pointer to the allocated memory buffer.
*/
void *spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags);

// some members omitted
struct spdk_nvme_transport_ops {
    struct spdk_nvme_ctrlr *(*ctrlr_construct)(const struct spdk_nvme_transport_id *trid,
            const struct spdk_nvme_ctrlr_opts *opts,
            void *devhandle);

    int (*ctrlr_destruct)(struct spdk_nvme_ctrlr *ctrlr);

    int (*ctrlr_set_reg_4)(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value);

    int (*ctrlr_get_reg_4)(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value);

    struct spdk_nvme_qpair *(*ctrlr_create_io_qpair)(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
            const struct spdk_nvme_io_qpair_opts *opts);

    int (*ctrlr_delete_io_qpair)(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair);

    int (*qpair_submit_request)(struct spdk_nvme_qpair *qpair, struct nvme_request *req);

    int32_t (*qpair_process_completions)(struct spdk_nvme_qpair *qpair, uint32_t max_completions);
};

/**
* Enumerate the bus indicated by the transport ID and attach the userspace NVMe
* driver to each device found if desired.
*
* This function is not thread safe and should only be called from one thread at
* a time while no other threads are actively using any NVMe devices.
*
* If called from a secondary process, only devices that have been attached to
* the userspace driver in the primary process will be probed.
*
* If called more than once, only devices that are not already attached to the
* SPDK NVMe driver will be reported.
*
* To stop using the the controller and release its associated resources,
* call spdk_nvme_detach() with the spdk_nvme_ctrlr instance from the attach_cb()
* function.
*
* \param trid The transport ID indicating which bus to enumerate. If the trtype
* is PCIe or trid is NULL, this will scan the local PCIe bus. If the trtype is
* RDMA, the traddr and trsvcid must point at the location of an NVMe-oF discovery
* service.
* \param cb_ctx Opaque value which will be passed back in cb_ctx parameter of
* the callbacks.
* \param probe_cb will be called once per NVMe device found in the system.
* \param attach_cb will be called for devices for which probe_cb returned true
* once that NVMe controller has been attached to the userspace driver.
* \param remove_cb will be called for devices that were attached in a previous
* spdk_nvme_probe() call but are no longer attached to the system. Optional;
* specify NULL if removal notices are not desired.
*
* \return 0 on success, -1 on failure.
*/
int spdk_nvme_probe(const struct spdk_nvme_transport_id *trid,
                    void *cb_ctx,
                    spdk_nvme_probe_cb probe_cb,
                    spdk_nvme_attach_cb attach_cb,
                    spdk_nvme_remove_cb remove_cb);

/**
* Allocate an I/O queue pair (submission and completion queue).
*
* This function by default also performs any connection activities required for
* a newly created qpair. To avoid that behavior, the user should set the create_only
* flag in the opts structure to true.
*
* Each queue pair should only be used from a single thread at a time (mutual
* exclusion must be enforced by the user).
*
* \param ctrlr NVMe controller for which to allocate the I/O queue pair.
* \param opts I/O qpair creation options, or NULL to use the defaults as returned
* by spdk_nvme_ctrlr_get_default_io_qpair_opts().
* \param opts_size Must be set to sizeof(struct spdk_nvme_io_qpair_opts), or 0
* if opts is NULL.
*
* \return a pointer to the allocated I/O queue pair.
*/
struct spdk_nvme_qpair *spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *ctrlr,
        const struct spdk_nvme_io_qpair_opts *opts,
        size_t opts_size);

// some members omitted
/**
 * NVMe I/O queue pair initialization options.
 *
 * These options may be passed to spdk_nvme_ctrlr_alloc_io_qpair() to configure queue pair
 * options at queue creation time.
 *
 * The user may retrieve the default I/O queue pair creation options for a controller using
 * spdk_nvme_ctrlr_get_default_io_qpair_opts().
 */
struct spdk_nvme_io_qpair_opts {
    /**
     * Queue priority for weighted round robin arbitration. If a different arbitration
     * method is in use, pass 0.
     */
    enum spdk_nvme_qprio qprio;

    /**
     * The queue depth of this NVMe I/O queue. Overrides spdk_nvme_ctrlr_opts::io_queue_size.
     */
    uint32_t io_queue_size;

    /**
     * The number of requests to allocate for this NVMe I/O queue.
     *
     * Overrides spdk_nvme_ctrlr_opts::io_queue_requests.
     *
     * This should be at least as large as io_queue_size.
     *
     * A single I/O may allocate more than one request, since splitting may be
     * necessary to conform to the device's maximum transfer size, PRP list
     * compatibility requirements, or driver-assisted striping.
     */
    uint32_t io_queue_requests;
} __attribute__((packed));

/**
* \brief Submits a read I/O to the specified NVMe namespace.
*
* The command is submitted to a qpair allocated by spdk_nvme_ctrlr_alloc_io_qpair().
* The user must ensure that only one thread submits I/O on a given qpair at any
* given time.
*
* \param ns NVMe namespace to submit the read I/O.
* \param qpair I/O queue pair to submit the request.
* \param payload Virtual address pointer to the data payload.
* \param lba Starting LBA to read the data.
* \param lba_count Length (in sectors) for the read operation.
* \param cb_fn Callback function to invoke when the I/O is completed.
* \param cb_arg Argument to pass to the callback function.
* \param io_flags Set flags, defined in nvme_spec.h, for this I/O.
*
* \return 0 if successfully submitted, negated errnos on the following error conditions:
* -EINVAL: The request is malformed.
* -ENOMEM: The request cannot be allocated.
* -ENXIO: The qpair is failed at the transport level.
*/
int spdk_nvme_ns_cmd_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *payload,
                          uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn,
                          void *cb_arg, uint32_t io_flags);

/**
* Process any outstanding completions for I/O submitted on a queue pair.
*
* This call is non-blocking, i.e. it only processes completions that are ready
* at the time of this function call. It does not wait for outstanding commands
* to finish.
*
* For each completed command, the request's callback function will be called if
* specified as non-NULL when the request was submitted.
*
* The caller must ensure that each queue pair is only used from one thread at a
* time.
*
* This function may be called at any point while the controller is attached to
* the SPDK NVMe driver.
*
* \sa spdk_nvme_cmd_cb
*
* \param qpair Queue pair to check for completions.
* \param max_completions Limit the number of completions to be processed in one
* call, or 0 for unlimited.
*
* \return number of completions processed (may be 0) or negated on error. -ENXIO
* in the special case that the qpair is failed at the transport layer.
*/
int32_t spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair,
                                            uint32_t max_completions);
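
Putting the last few APIs together, the per-thread submit-then-poll pattern looks like this (modeled on examples/nvme/hello_world/hello_world.c; ctrlr, ns, buf, and read_complete are assumed to exist):

struct io_sequence {
    volatile bool is_completed;
} sequence = { false };
struct spdk_nvme_qpair *qpair;
int rc;

/* One qpair per thread, so neither submit nor poll needs a lock. */
qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0);

rc = spdk_nvme_ns_cmd_read(ns, qpair, buf, 0 /* lba */, 1 /* lba_count */,
                           read_complete, &sequence, 0 /* io_flags */);

/* Busy-poll the CQ until read_complete flips is_completed. */
while (!sequence.is_completed) {
    spdk_nvme_qpair_process_completions(qpair, 0 /* no limit */);
}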

// qpair->transport->ops.qpair_submit_request(qpair, req);
struct nvme_request {
    struct spdk_nvme_cmd cmd;

    uint8_t retries;

    uint8_t timed_out : 1;

    /**
     * True if the request is in the queued_req list.
     */
    uint8_t queued : 1;
    uint8_t reserved : 6;

    /**
     * Number of children requests still outstanding for this
     * request which was split into multiple child requests.
     */
    uint16_t num_children;

    /**
     * Offset in bytes from the beginning of payload for this request.
     * This is used for I/O commands that are split into multiple requests.
     */
    uint32_t payload_offset;
    uint32_t md_offset;

    uint32_t payload_size;

    /**
     * Timeout ticks for error injection requests, can be extended in future
     * to support per-request timeout feature.
     */
    uint64_t timeout_tsc;

    /**
     * Data payload for this request's command.
     */
    struct nvme_payload payload;

    spdk_nvme_cmd_cb cb_fn;
    void *cb_arg;
    STAILQ_ENTRY(nvme_request) stailq;

    struct spdk_nvme_qpair *qpair;

    /*
     * The value of spdk_get_ticks() when the request was submitted to the hardware.
     * Only set if ctrlr->timeout_enabled is true.
     */
    uint64_t submit_tick;

    /**
     * The active admin request can be moved to a per process pending
     * list based on the saved pid to tell which process it belongs
     * to. The cpl saves the original completion information which
     * is used in the completion callback.
     * NOTE: these below two fields are only used for admin request.
     */
    pid_t pid;
    struct spdk_nvme_cpl cpl;

    uint32_t md_size;

    /**
     * The following members should not be reordered with members
     * above. These members are only needed when splitting
     * requests which is done rarely, and the driver is careful
     * to not touch the following fields until a split operation is
     * needed, to avoid touching an extra cacheline.
     */

    /**
     * Points to the outstanding child requests for a parent request.
     * Only valid if a request was split into multiple children
     * requests, and is not initialized for non-split requests.
     */
    TAILQ_HEAD(, nvme_request) children;

    /**
     * Linked-list pointers for a child request in its parent's list.
     */
    TAILQ_ENTRY(nvme_request) child_tailq;

    /**
     * Points to a parent request if part of a split request,
     * NULL otherwise.
     */
    struct nvme_request *parent;

    /**
     * Completion status for a parent request. Initialized to all 0's
     * (SUCCESS) before child requests are submitted. If a child
     * request completes with error, the error status is copied here,
     * to ensure that the parent request is also completed with error
     * status once all child requests are completed.
     */
    struct spdk_nvme_cpl parent_status;

    /**
     * The user_cb_fn and user_cb_arg fields are used for holding the original
     * callback data when using nvme_allocate_request_user_copy.
     */
    spdk_nvme_cmd_cb user_cb_fn;
    void *user_cb_arg;
    void *user_buffer;
};

struct __attribute__((packed)) spdk_nvme_ctrlr_data {
    /* bytes 0-255: controller capabilities and features */

    /* ... */

    /** maximum data transfer size */
    uint8_t mdts;

    /* ... */
};

struct spdk_nvme_ns_data {
    /* ... */

    /** namespace optimal I/O boundary in logical blocks */
    uint16_t noiob;

    /* ... */
};
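
To make mdts concrete: in the NVMe spec it is a power-of-two exponent in units of the controller's minimum memory page size (CAP.MPSMIN), with 0 meaning no limit. For example, with a 4 KiB minimum page size and mdts = 5, the maximum transfer is 4 KiB × 2^5 = 128 KiB, so a 1 MiB request would be split by the driver into 8 child requests; noiob similarly tells the driver where splits should fall to stay on the namespace's optimal I/O boundaries.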

/**
* Submit a write I/O to the specified NVMe namespace.
*
* The command is submitted to a qpair allocated by spdk_nvme_ctrlr_alloc_io_qpair().
* The user must ensure that only one thread submits I/O on a given qpair at any
* given time.
*
* \param ns NVMe namespace to submit the write I/O.
* \param qpair I/O queue pair to submit the request.
* \param lba Starting LBA to write the data.
* \param lba_count Length (in sectors) for the write operation.
* \param cb_fn Callback function to invoke when the I/O is completed.
* \param cb_arg Argument to pass to the callback function.
* \param io_flags Set flags, defined in nvme_spec.h, for this I/O.
* \param reset_sgl_fn Callback function to reset scattered payload.
* \param next_sge_fn Callback function to iterate each scattered payload memory
* segment.
*
* \return 0 if successfully submitted, negated errnos on the following error conditions:
* -EINVAL: The request is malformed.
* -ENOMEM: The request cannot be allocated.
* -ENXIO: The qpair is failed at the transport level.
*/
int spdk_nvme_ns_cmd_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
                            uint64_t lba, uint32_t lba_count,
                            spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
                            spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
                            spdk_nvme_req_next_sge_cb next_sge_fn);
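
For reference, a sketch of the two scattered-payload callbacks that spdk_nvme_ns_cmd_writev expects. The context struct here is an assumption of mine, but the callback shapes follow spdk_nvme_req_reset_sgl_cb and spdk_nvme_req_next_sge_cb:

struct sgl_ctx {
    struct iovec *iovs;
    int iovcnt;
    int cur;
    uint32_t cur_offset;
};

/* Seek to the iov element containing byte `offset` of the payload. */
static void
reset_sgl(void *cb_arg, uint32_t offset)
{
    struct sgl_ctx *ctx = cb_arg;

    for (ctx->cur = 0; offset >= ctx->iovs[ctx->cur].iov_len; ctx->cur++) {
        offset -= ctx->iovs[ctx->cur].iov_len;
    }
    ctx->cur_offset = offset;
}

/* Hand the driver the next contiguous payload segment. */
static int
next_sge(void *cb_arg, void **address, uint32_t *length)
{
    struct sgl_ctx *ctx = cb_arg;
    struct iovec *iov = &ctx->iovs[ctx->cur++];

    *address = (uint8_t *)iov->iov_base + ctx->cur_offset;
    *length = iov->iov_len - ctx->cur_offset;
    ctx->cur_offset = 0;
    return 0;
}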

SPDK Blobstore: A Look Inside the NVM Optimized Allocator

The code covered in the slides: spdk/examples/blob/hello_world/hello_blob.c

./build/examples/hello_blob  ./examples/blob/hello_world/hello_blob.json
[2023-06-20 18:58:03.591974] Starting SPDK v23.05-pre git sha1 ee06693c3 / DPDK 22.11.1 initialization...
[2023-06-20 18:58:03.592772] [ DPDK EAL parameters: hello_blob --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid59413 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-06-20 18:58:03.732576] app.c: 738:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-06-20 18:58:04.022598] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-06-20 18:58:04.420790] accel_sw.c: 601:sw_accel_module_init: *NOTICE*: Accel framework software module initialized.
[2023-06-20 18:58:04.692072] hello_blob.c: 386:hello_start: *NOTICE*: entry
[2023-06-20 18:58:04.695777] hello_blob.c: 345:bs_init_complete: *NOTICE*: entry
[2023-06-20 18:58:04.696207] hello_blob.c: 353:bs_init_complete: *NOTICE*: blobstore: 0x563aa7539de0
[2023-06-20 18:58:04.696298] hello_blob.c: 332:create_blob: *NOTICE*: entry
[2023-06-20 18:58:04.696457] hello_blob.c: 311:blob_create_complete: *NOTICE*: entry
[2023-06-20 18:58:04.696579] hello_blob.c: 319:blob_create_complete: *NOTICE*: new blob id 4294967296
[2023-06-20 18:58:04.696689] hello_blob.c: 280:open_complete: *NOTICE*: entry
[2023-06-20 18:58:04.696801] hello_blob.c: 290:open_complete: *NOTICE*: blobstore has FREE clusters of 15
[2023-06-20 18:58:04.696911] hello_blob.c: 256:resize_complete: *NOTICE*: resized blob now has USED clusters of 15
[2023-06-20 18:58:04.697014] hello_blob.c: 233:sync_complete: *NOTICE*: entry
[2023-06-20 18:58:04.697098] hello_blob.c: 195:blob_write: *NOTICE*: entry
[2023-06-20 18:58:04.697263] hello_blob.c: 178:write_complete: *NOTICE*: entry
[2023-06-20 18:58:04.697386] hello_blob.c: 153:read_blob: *NOTICE*: entry
[2023-06-20 18:58:04.697475] hello_blob.c: 126:read_complete: *NOTICE*: entry
[2023-06-20 18:58:04.697568] hello_blob.c: 140:read_complete: *NOTICE*: read SUCCESS and data matches!
[2023-06-20 18:58:04.697703] hello_blob.c: 106:delete_blob: *NOTICE*: entry
[2023-06-20 18:58:04.699957] hello_blob.c: 87:delete_complete: *NOTICE*: entry
[2023-06-20 18:58:04.700400] hello_blob.c: 50:unload_complete: *NOTICE*: entry
[2023-06-20 18:58:04.783541] hello_blob.c: 459:main: *NOTICE*: SUCCESS!

The config file here is not passed with -c; hello_blob simply takes it from argv[1]:

/*
* Setup a few specifics before we init, for most SPDK cmd line
* apps, the config file will be passed in as an arg but to make
* this example super simple we just hardcode it. We also need to
* specify a name for the app.
*/
opts.name = "hello_blob";
opts.json_config_file = argv[1];

hello_blob.c is easy to follow: it is callbacks all the way down, and reading it from bottom to top works well. The main value is seeing how the various APIs are used.

  Message passing is efficient, but it results in asynchronous code. Unfortunately, asynchronous code is a challenge in C. It’s often implemented by passing function pointers that are called when an operation completes. This chops up the code so that it isn’t easy to follow, especially through logic branches. The best solution is to use a language with support for futures and promises, such as C++, Rust, Go, or almost any other higher level language. However, SPDK is a low level library and requires very wide compatibility and portability, so we’ve elected to stay with plain old C.
  We do have a few recommendations to share, though. For simple callback chains, it’s easiest if you write the functions from bottom to top. By that we mean if function foo performs some asynchronous operation and when that completes function bar is called, then function bar performs some operation that calls function baz on completion, a good way to write it is as such:

void baz(void *ctx) {
    ...
}

void bar(void *ctx) {
    async_op(baz, ctx);
}

void foo(void *ctx) {
    async_op(bar, ctx);
}

  Don’t split these functions up - keep them as a nice unit that can be read from bottom to top.

Read the event framework code and docs:
spdk/include/spdk/event.h
spdk/doc/event.md

/**
* Start the framework.
*
* Before calling this function, opts must be initialized by
* spdk_app_opts_init(). Once started, the framework will call start_fn on
* an spdk_thread running on the current system thread with the
* argument provided.
*
* If opts->delay_subsystem_init is set
* (e.g. through --wait-for-rpc flag in spdk_app_parse_args())
* this function will only start a limited RPC server accepting
* only a few RPC commands - mostly related to pre-initialization.
* With this option, the framework won't be started and start_fn
* won't be called until the user sends an `rpc_framework_start_init`
* RPC command, which marks the pre-initialization complete and
* allows start_fn to be finally called.
*
* This call will block until spdk_app_stop() is called. If an error
* condition occurs during the initialization code within spdk_app_start(),
* this function will immediately return before invoking start_fn.
*
* \param opts_user Initialization options used for this application. It should not be
* NULL. And the opts_size value inside the opts structure should not be zero.
* \param start_fn Entry point that will execute on an internally created thread
* once the framework has been started.
* \param ctx Argument passed to function start_fn.
*
* \return 0 on success or non-zero on failure.
*/
int spdk_app_start(struct spdk_app_opts *opts_user, spdk_msg_fn start_fn,
void *ctx);

/**
* Perform final shutdown operations on an application using the event framework.
*/
void spdk_app_fini(void);

/**
* Start shutting down the framework.
*
* Typically this function is not called directly, and the shutdown process is
* started implicitly by a process signal. But in applications that are using
* SPDK for a subset of its process threads, this function can be called in lieu
* of a signal.
*/
void spdk_app_start_shutdown(void);
/**
* Stop the framework.
*
* This does not wait for all threads to exit. Instead, it kicks off the shutdown
* process and returns. Once the shutdown process is complete, spdk_app_start()
* will return.
*
* \param rc The rc value specified here will be returned to caller of spdk_app_start().
*/
void spdk_app_stop(int rc);
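
Putting these calls together, the typical lifecycle is easiest to see in a skeleton. This is a minimal sketch of my own (my_start and the app name are placeholders), not code from the SPDK tree:

#include "spdk/event.h"

static void
my_start(void *ctx)
{
    /* Runs on an spdk_thread once the framework is up.
     * Kick off asynchronous work here; when it finishes: */
    spdk_app_stop(0);    /* makes the blocked spdk_app_start() return 0 */
}

int
main(int argc, char **argv)
{
    struct spdk_app_opts opts = {};
    int rc;

    spdk_app_opts_init(&opts, sizeof(opts));    /* must precede spdk_app_start() */
    opts.name = "skeleton";

    rc = spdk_app_start(&opts, my_start, NULL); /* blocks until spdk_app_stop() */
    spdk_app_fini();                            /* final shutdown */
    return rc;
}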

spdk+rocksdb

Reference: RocksDB Integration

# Install gflags first, then build RocksDB
git clone https://github.com/gflags/gflags.git
cd gflags
mkdir build
cd build
cmake -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DINSTALL_HEADERS=ON -DINSTALL_SHARED_LIBS=ON -DINSTALL_STATIC_LIBS=ON ..
make && make install
# Build the db_bench executable
cd ../..
git clone -b 6.15.fb https://github.com/spdk/rocksdb.git
cd rocksdb
make db_bench SPDK_DIR=../spdk

# Create a BlobFS on the NVMe device
scripts/gen_nvme.sh --json-with-subsystems > ../rocksdb.json
test/blobfs/mkfs/mkfs ../rocksdb.json Nvme0n1

(Directory layout diagram)

Installing gflags on Ubuntu 20.04
If gflags is not installed, running db_bench fails as follows:

 ./db_bench 
Please install gflags to run rocksdb tools

// Relevant code: db_bench.cc
#ifndef GFLAGS
#include <cstdio>
int main() {
    fprintf(stderr, "Please install gflags to run rocksdb tools\n");
    return 1;
}
#else
#include <rocksdb/db_bench_tool.h>
int main(int argc, char** argv) {
    return ROCKSDB_NAMESPACE::db_bench_tool(argc, argv);
}
#endif  // GFLAGS

libgflags.so.2.2: cannot open shared object file: No such file or directory

./db_bench
./db_bench: error while loading shared libraries: libgflags.so.2.2: cannot open shared object file: No such file or directory
ldd db_bench
linux-vdso.so.1 (0x00007fff14f4b000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fda223f7000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fda223d4000)
libgflags.so.2.2 => not found
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007fda223b8000)
libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007fda223ab000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007fda223a1000)
libuuid.so.1 => /lib/x86_64-linux-gnu/libuuid.so.1 (0x00007fda22396000)
libssl.so.1.1 => /lib/x86_64-linux-gnu/libssl.so.1.1 (0x00007fda22303000)
libcrypto.so.1.1 => /lib/x86_64-linux-gnu/libcrypto.so.1.1 (0x00007fda2202d000)
libaio.so.1 => /lib/x86_64-linux-gnu/libaio.so.1 (0x00007fda22028000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fda21e46000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fda21cf7000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fda21cda000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fda21ae8000)
/lib64/ld-linux-x86-64.so.2 (0x00007fda23113000)

Not sure where libgflags.so.2.2 was installed; check the output of gflags' make install:

 make install
[ 25%] Built target gflags_nothreads_shared
[ 50%] Built target gflags_shared
[ 75%] Built target gflags_nothreads_static
[100%] Built target gflags_static
Install the project...
-- Install configuration: "Release"
-- Installing: /usr/local/lib/libgflags.so.2.2.2
-- Installing: /usr/local/lib/libgflags.so.2.2
-- Installing: /usr/local/lib/libgflags.so
-- Installing: /usr/local/lib/libgflags_nothreads.so.2.2.2
-- Installing: /usr/local/lib/libgflags_nothreads.so.2.2
-- Installing: /usr/local/lib/libgflags_nothreads.so
-- Installing: /usr/local/lib/libgflags.a
-- Installing: /usr/local/lib/libgflags_nothreads.a
-- Installing: /usr/local/include/gflags/gflags.h
-- Installing: /usr/local/include/gflags/gflags_declare.h
-- Installing: /usr/local/include/gflags/gflags_completions.h
-- Installing: /usr/local/include/gflags/gflags_gflags.h
-- Installing: /usr/local/lib/cmake/gflags/gflags-config.cmake
-- Installing: /usr/local/lib/cmake/gflags/gflags-config-version.cmake
-- Installing: /usr/local/lib/cmake/gflags/gflags-targets.cmake
-- Installing: /usr/local/lib/cmake/gflags/gflags-targets-release.cmake
-- Installing: /usr/local/lib/cmake/gflags/gflags-nonamespace-targets.cmake
-- Installing: /usr/local/lib/cmake/gflags/gflags-nonamespace-targets-release.cmake
-- Installing: /usr/local/bin/gflags_completions.sh
-- Installing: /usr/local/lib/pkgconfig/gflags.pc
-- Installing: /root/.cmake/packages/gflags/e5f7ce61772240490d3164df06f58ce9

gflags was installed into /usr/local/lib, which is not on the default search path, so add it to the shared-library search path (or register it permanently via ldconfig):

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
ldd db_bench 
linux-vdso.so.1 (0x00007ffcdeff2000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f6cfa436000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f6cfa413000)
libgflags.so.2.2 => /usr/local/lib/libgflags.so.2.2 (0x00007f6cfa3e6000)
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007f6cfa3ca000)
libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007f6cfa3bd000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f6cfa3b3000)
libuuid.so.1 => /lib/x86_64-linux-gnu/libuuid.so.1 (0x00007f6cfa3a8000)
libssl.so.1.1 => /lib/x86_64-linux-gnu/libssl.so.1.1 (0x00007f6cfa315000)
libcrypto.so.1.1 => /lib/x86_64-linux-gnu/libcrypto.so.1.1 (0x00007f6cfa03f000)
libaio.so.1 => /lib/x86_64-linux-gnu/libaio.so.1 (0x00007f6cfa03a000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f6cf9e58000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f6cf9d09000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f6cf9cec000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f6cf9afa000)
/lib64/ld-linux-x86-64.so.2 (0x00007f6cfb152000)

A simple comparison: SPDK vs. the kernel filesystem
db_bench's SPDK-related flags

// rocksdb/tools/db_bench_tool.cc
DEFINE_string(spdk, "", "Name of SPDK configuration file");
DEFINE_string(spdk_bdev, "", "Name of SPDK blockdev to load");
DEFINE_uint64(spdk_cache_size, 4096, "Size of SPDK filesystem cache (in MB)");
root@driver-virtual-machine ~/s/rocksdb (6.15.fb)# ./db_bench --spdk=../rocksdb.json --spdk_bdev=Nvme0n1  --benchmarks=fillrandom --num=1000000 --compression_type=none --spdk_cache_size=1024
[2023-06-20 22:18:35.512845] Starting SPDK v23.05-pre git sha1 ee06693c3 / DPDK 22.11.1 initialization...
[2023-06-20 22:18:35.513502] [ DPDK EAL parameters: rocksdb --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid136961 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-06-20 22:18:35.634516] app.c: 738:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-06-20 22:18:35.685057] app.c: 447:app_setup_trace: *NOTICE*: Tracepoint Group Mask 0x80 specified.
[2023-06-20 22:18:35.685576] app.c: 448:app_setup_trace: *NOTICE*: Use 'spdk_trace -s rocksdb -p 136961' to capture a snapshot of events at runtime.
[2023-06-20 22:18:35.687013] app.c: 453:app_setup_trace: *NOTICE*: Or copy /dev/shm/rocksdb_trace.pid136961 for offline analysis/debug.
[2023-06-20 22:18:35.687997] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-06-20 22:18:35.736796] accel_sw.c: 601:sw_accel_module_init: *NOTICE*: Accel framework software module initialized.
using bdev Nvme0n1
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
RocksDB: version 6.15
Date: Tue Jun 20 22:18:36 2023
CPU: 2 * AMD Ryzen 5 4600U with Radeon Graphics
CPUCache: 512 KB
Keys: 16 bytes each (+ 0 bytes user-defined timestamp)
Values: 100 bytes each (50 bytes after compression)
Entries: 1000000
Prefix: 0 bytes
Keys per prefix: 0
RawSize: 110.6 MB (estimated)
FileSize: 62.9 MB (estimated)
Write rate: 0 bytes/second
Read rate: 0 ops/second
Compression: NoCompression
Compression sampling rate: 0
Memtablerep: skip_list
Perf Level: 1
WARNING: Assertions are enabled; benchmarks unnecessarily slow
------------------------------------------------
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
DB path: [/tmp/rocksdbtest-0/dbbench]
fillrandom : 6.511 micros/op 153577 ops/sec; 17.0 MB/s
root@driver-virtual-machine ~/s/rocksdb (6.15.fb)# ./db_bench   --benchmarks=fillrandom --num=1000000 --compression_type=none 
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
RocksDB: version 6.15
Date: Tue Jun 20 22:47:23 2023
CPU: 2 * AMD Ryzen 5 4600U with Radeon Graphics
CPUCache: 512 KB
Keys: 16 bytes each (+ 0 bytes user-defined timestamp)
Values: 100 bytes each (50 bytes after compression)
Entries: 1000000
Prefix: 0 bytes
Keys per prefix: 0
RawSize: 110.6 MB (estimated)
FileSize: 62.9 MB (estimated)
Write rate: 0 bytes/second
Read rate: 0 ops/second
Compression: NoCompression
Compression sampling rate: 0
Memtablerep: skip_list
Perf Level: 1
WARNING: Assertions are enabled; benchmarks unnecessarily slow
------------------------------------------------
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
DB path: [/tmp/rocksdbtest-0/dbbench]
fillrandom : 8.553 micros/op 116911 ops/sec; 12.9 MB/s
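
Takeaway from this comparison (a single run, in a VM, with assertions enabled, so only indicative): fillrandom over SPDK's BlobFS reaches 153,577 ops/sec (17.0 MB/s) versus 116,911 ops/sec (12.9 MB/s) on the kernel filesystem, roughly 31% higher.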

hello_blob

A quick pass over the docs and source

Switched to the v23.09 branch and read the Blobstore Programmer's Guide.

Run the hello_blob program and inspect the output:

root@osd-node0 ~/l/s/b/examples (v23.09.x)# ./hello_blob /root/xxx/spdk/examples/blob/hello_world/hello_blob.json
[2023-10-30 10:42:55.201033] Starting SPDK v23.09.1-pre git sha1 aa8059716 / DPDK 23.07.0 initialization...
[2023-10-30 10:42:55.201151] [ DPDK EAL parameters: hello_blob --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid247614 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-10-30 10:42:55.209967] app.c: 786:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-10-30 10:42:55.237068] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-10-30 10:42:55.307336] hello_blob.c: 386:hello_start: *NOTICE*: entry
[2023-10-30 10:42:55.308734] hello_blob.c: 345:bs_init_complete: *NOTICE*: entry
[2023-10-30 10:42:55.308751] hello_blob.c: 353:bs_init_complete: *NOTICE*: blobstore: 0x560b5be05540
[2023-10-30 10:42:55.308755] hello_blob.c: 332:create_blob: *NOTICE*: entry
[2023-10-30 10:42:55.308763] hello_blob.c: 311:blob_create_complete: *NOTICE*: entry
[2023-10-30 10:42:55.308766] hello_blob.c: 319:blob_create_complete: *NOTICE*: new blob id 4294967296
[2023-10-30 10:42:55.308772] hello_blob.c: 280:open_complete: *NOTICE*: entry
[2023-10-30 10:42:55.308775] hello_blob.c: 290:open_complete: *NOTICE*: blobstore has FREE clusters of 15
[2023-10-30 10:42:55.308781] hello_blob.c: 256:resize_complete: *NOTICE*: resized blob now has USED clusters of 15
[2023-10-30 10:42:55.308786] hello_blob.c: 233:sync_complete: *NOTICE*: entry
[2023-10-30 10:42:55.308789] hello_blob.c: 195:blob_write: *NOTICE*: entry
[2023-10-30 10:42:55.308794] hello_blob.c: 178:write_complete: *NOTICE*: entry
[2023-10-30 10:42:55.308797] hello_blob.c: 153:read_blob: *NOTICE*: entry
[2023-10-30 10:42:55.308800] hello_blob.c: 126:read_complete: *NOTICE*: entry
[2023-10-30 10:42:55.308803] hello_blob.c: 140:read_complete: *NOTICE*: read SUCCESS and data matches!
[2023-10-30 10:42:55.308807] hello_blob.c: 106:delete_blob: *NOTICE*: entry
[2023-10-30 10:42:55.309275] hello_blob.c: 87:delete_complete: *NOTICE*: entry
[2023-10-30 10:42:55.309289] hello_blob.c: 50:unload_complete: *NOTICE*: entry
[2023-10-30 10:42:55.374886] hello_blob.c: 459:main: *NOTICE*: SUCCESS!

Order of API calls

// Create a blobstore block device from a bdev.
rc = spdk_bdev_create_bs_dev_ext("Malloc0", base_bdev_event_cb, NULL, &bs_dev);

// Initialize a blobstore on the given device.
spdk_bs_init(bs_dev, NULL, bs_init_complete, hello_context);

// Create a new blob with default option values on the given blobstore. The new blob id will be passed to the callback function.
spdk_bs_create_blob(hello_context->bs, blob_create_complete, hello_context);

// Open a blob from the given blobstore.
spdk_bs_open_blob(hello_context->bs, hello_context->blobid, open_complete, hello_context);

// Resize a blob to 'sz' clusters. These changes are not persisted to disk until spdk_blob_sync_md() is called. If called before a previous resize has finished, it will fail with errno -EBUSY.
spdk_blob_resize(hello_context->blob, free, resize_complete, hello_context);

// Sync a blob. Make a blob persistent. This applies to open, resize, set xattr, and remove xattr. These operations will not be persistent until the blob has been synced.
spdk_blob_sync_md(hello_context->blob, sync_complete, hello_context);

// Allocate an I/O channel for the given blobstore.
hello_context->channel = spdk_bs_alloc_io_channel(hello_context->bs);

// Write data to a blob.
spdk_blob_io_write(hello_context->blob, hello_context->channel, hello_context->write_buff, 0, 1, write_complete, hello_context);

// Read data from a blob.
spdk_blob_io_read(hello_context->blob, hello_context->channel, hello_context->read_buff, 0, 1, read_complete, hello_context);

// Close a blob. This will automatically sync.
spdk_blob_close(hello_context->blob, delete_blob, hello_context);

// Delete an existing blob from the given blobstore.
spdk_bs_delete_blob(hello_context->bs, hello_context->blobid, delete_complete, hello_context);

// Free the I/O channel.
spdk_bs_free_io_channel(hello_context->channel);

// Unload the blobstore. It will flush all volatile data to disk.
spdk_bs_unload(hello_context->bs, unload_complete, hello_context);


// Other APIs

// Get the io unit size in bytes.
hello_context->io_unit_size = spdk_bs_get_io_unit_size(hello_context->bs);

// Get the number of free clusters.
free = spdk_bs_free_cluster_count(hello_context->bs);

// Get the number of clusters allocated to the blob.
total = spdk_blob_get_num_clusters(hello_context->blob);



struct hello_context_t {
    struct spdk_blob_store *bs;
    struct spdk_blob *blob;
    spdk_blob_id blobid;
    struct spdk_io_channel *channel;
    uint8_t *read_buff;
    uint8_t *write_buff;
    uint64_t io_unit_size;
    int rc;
};

Change the log level to see debug output (requires rebuilding with debug enabled):

./configure --enable-debug

static void hello_start(void *arg1) {
    spdk_log_set_flag("all");
    spdk_log_set_print_level(SPDK_LOG_DEBUG);
    // ...
}

Deep dives

Blob ids

Allocating a blob id

spdk_bs_create_blob(hello_context->bs, blob_create_complete, hello_context);
bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg);
// Relevant code
spdk_spin_lock(&bs->used_lock);
// Find the first clear bit
page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0);
if (page_idx == UINT32_MAX) {
    spdk_spin_unlock(&bs->used_lock);
    cb_fn(cb_arg, 0, -ENOMEM);
    return;
}
// Set the corresponding bit
spdk_bit_array_set(bs->used_blobids, page_idx);
bs_claim_md_page(bs, page_idx);
spdk_spin_unlock(&bs->used_lock);
// Compute the blob id for this page index
id = bs_page_to_blobid(page_idx);

SPDK_DEBUGLOG(blob, "Creating blob with id 0x%" PRIx64 " at page %u\n", id, page_idx);

// Output:
blobstore.c:5989:bs_create_blob: *DEBUG*: Creating blob with id 0x100000000 at page 0
// Member variables involved (two bitmaps)
struct spdk_blob_store {
    struct spdk_bit_array *used_md_pages; /* Protected by used_lock */
    struct spdk_bit_array *used_blobids;
};

// Size of used_md_pages
if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) {
    /* By default, allocate 1 page per cluster.
     * Technically, this over-allocates metadata
     * because more metadata will reduce the number
     * of usable clusters. This can be addressed with
     * more complex math in the future.
     */
    // One metadata page per cluster
    bs->md_len = bs->total_clusters;
} else {
    bs->md_len = opts.num_md_pages;
}
rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len);
rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len);
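
The find-first-clear/set idiom above is easy to play with on its own. A tiny standalone sketch against the public spdk/bit_array.h API (the sizes here are arbitrary):

#include <stdio.h>
#include "spdk/bit_array.h"

int
main(void)
{
    struct spdk_bit_array *ba = spdk_bit_array_create(8);    /* 8 bits, all clear */
    uint32_t idx;

    spdk_bit_array_set(ba, 0);
    spdk_bit_array_set(ba, 1);

    /* First clear bit at or after index 0 -- mirrors how bs_create_blob
     * picks the next free metadata page; returns UINT32_MAX if none. */
    idx = spdk_bit_array_find_first_clear(ba, 0);
    printf("next free page index: %u\n", idx);    /* prints 2 */

    spdk_bit_array_free(&ba);
    return 0;
}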

Mapping page index to blob id

// A blob id is the metadata page index OR'd with (1ULL << 32); the high bit exists
// purely to catch bugs where code conflates metadata page indexes with blob ids.
// This is also why the run above printed "new blob id 4294967296": page 0 maps to 0x100000000.
static inline uint64_t
bs_blobid_to_page(spdk_blob_id id)
{
    return id & 0xFFFFFFFF;
}

/* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper
 * 32 bits are not currently used. Stick a 1 there just to catch bugs where the
 * code assumes blob id == page_idx.
 */
static inline spdk_blob_id
bs_page_to_blobid(uint64_t page_idx)
{
    if (page_idx > UINT32_MAX) {
        return SPDK_BLOBID_INVALID;
    }
    return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx;
}

Finding a blob by its id

spdk_bs_open_blob(hello_context->bs, hello_context->blobid, open_complete, hello_context);
bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg);
// Relevant code
// First verify the blob id is a valid, previously allocated id
page_num = bs_blobid_to_page(blobid);
if (spdk_bit_array_get(bs->used_blobids, page_num) == false) {
    /* Invalid blobid */
    cb_fn(cb_arg, NULL, -ENOENT);
    return;
}
// Is this blob already open?
blob = blob_lookup(bs, blobid);
if (blob) { // already open: bump the refcount and return it
    blob->open_ref++;
    cb_fn(cb_arg, blob, 0);
    return;
}
// Allocate the in-memory blob structure
blob = blob_alloc(bs, blobid);
// Then load its metadata from disk to populate the blob (not traced here)
// Relevant member variables
struct spdk_blob_store {
struct spdk_bit_array *open_blobids;
RB_HEAD(spdk_blob_tree, spdk_blob) open_blobs; // red-black tree of open blobs
};
// Initialization
rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len);
RB_INIT(&bs->open_blobs);


static struct spdk_blob *
blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid)
{
struct spdk_blob find;
// Check the open_blobids bitmap first; 0 means the blob is not open
if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) {
return NULL;
}
// Look the blob up in the red-black tree and return it
find.id = blobid;
return RB_FIND(spdk_blob_tree, &bs->open_blobs, &find);
}

static struct spdk_blob *
blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
{
struct spdk_blob *blob;

blob = calloc(1, sizeof(*blob));

blob->id = id;
blob->bs = bs;

blob->parent_id = SPDK_BLOBID_INVALID;

blob->state = SPDK_BLOB_STATE_DIRTY;
blob->extent_rle_found = false;
blob->extent_table_found = false;
blob->active.num_pages = 1;
blob->active.pages = calloc(1, sizeof(*blob->active.pages));

blob->active.pages[0] = bs_blobid_to_page(id);

TAILQ_INIT(&blob->xattrs);
TAILQ_INIT(&blob->xattrs_internal);
TAILQ_INIT(&blob->pending_persists);
TAILQ_INIT(&blob->persists_to_complete);

return blob;
}
// Disk-read path
// No need to trace the read itself: treat it as already complete, with the data
// in ctx->pages[0], and read the completion callback instead.
bs_sequence_read_dev(seq, &ctx->pages[0], lba, bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE), blob_load_cpl, ctx);

// How the LBA is computed
page_num = bs_blobid_to_page(blob->id);
lba = bs_md_page_to_lba(blob->bs, page_num);

static inline uint64_t
bs_md_page_to_lba(struct spdk_blob_store *bs, uint32_t page)
{
return bs_page_to_lba(bs, page + bs->md_start);
}

static inline uint64_t
bs_page_to_lba(struct spdk_blob_store *bs, uint64_t page)
{
return page * SPDK_BS_PAGE_SIZE / bs->dev->blocklen;
}
// Read completion callback
static void
blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
struct spdk_blob_load_ctx *ctx = cb_arg;
struct spdk_blob *blob = ctx->blob;
struct spdk_blob_md_page *page;
int rc;
uint32_t crc;
uint32_t current_page;

if (ctx->num_pages == 1) {
current_page = bs_blobid_to_page(blob->id);
} else {
assert(ctx->num_pages != 0);
page = &ctx->pages[ctx->num_pages - 2];
current_page = page->next;
}

/* Parse the pages */
rc = blob_parse(ctx->pages, ctx->num_pages, blob);

if (blob->extent_table_found == true) {
/* If EXTENT_TABLE was found, that means support for it should be enabled. */
assert(blob->extent_rle_found == false);
blob->use_extent_table = true;
} else {
/* If EXTENT_RLE or no extent_* descriptor was found disable support
* for extent table. No extent_* descriptors means that blob has length of 0
* and no extent_rle descriptors were persisted for it.
* EXTENT_TABLE if used, is always present in metadata regardless of length. */
blob->use_extent_table = false;
}

/* Check the clear_method stored in metadata vs what may have been passed
* via spdk_bs_open_blob_ext() and update accordingly.
*/
blob_update_clear_method(blob);

spdk_free(ctx->pages);
ctx->pages = NULL;

if (blob->extent_table_found) {
blob_load_cpl_extents_cpl(seq, ctx, 0);
} else {
blob_load_backing_dev(seq, ctx);
}
}

blob_load(seq, blob, bs_open_blob_cpl, blob);

static void
bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
struct spdk_blob *blob = cb_arg;
struct spdk_blob *existing;

existing = blob_lookup(blob->bs, blob->id);
if (existing) {
blob_free(blob);
existing->open_ref++;
seq->cpl.u.blob_handle.blob = existing;
bs_sequence_finish(seq, 0);
return;
}

blob->open_ref++;
// Update open_blobids and open_blobs
spdk_bit_array_set(blob->bs->open_blobids, blob->id);
RB_INSERT(spdk_blob_tree, &blob->bs->open_blobs, blob);

bs_sequence_finish(seq, bserrno);
}
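
Worked example: the blob created above has id 0x100000000, so page_num = 0x100000000 & 0xFFFFFFFF = 0, and its metadata sits at page bs->md_start. With the 4096-byte metadata page (SPDK_BS_PAGE_SIZE) and, say, a 512-byte-block device, that is LBA (0 + bs->md_start) * 4096 / 512 = 8 * bs->md_start.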

Metadata

The metadata thread


In short, the blob layer runs all metadata operations on a single dedicated metadata thread, which lets it avoid locking for metadata updates.

struct spdk_blob_store {
struct spdk_thread *md_thread;
};

// Initialization: the thread that calls spdk_bs_init() becomes the md_thread
void
spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
{
rc = bs_alloc(dev, &opts, &bs, &ctx);
}

static int
bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs,
struct spdk_bs_load_ctx **_ctx)
{
bs->md_thread = spdk_get_thread();
}

// Metadata-thread assertions
static void
bs_create_blob(struct spdk_blob_store *bs,
const struct spdk_blob_opts *opts,
const struct spdk_blob_xattr_opts *internal_xattrs,
spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
{
assert(spdk_get_thread() == bs->md_thread);
}

void
spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
spdk_blob_op_complete cb_fn, void *cb_arg)
{
assert(spdk_get_thread() == bs->md_thread);
}

static void
bs_open_blob(struct spdk_blob_store *bs,
spdk_blob_id blobid,
struct spdk_blob_open_opts *opts,
spdk_blob_op_with_handle_complete cb_fn,
void *cb_arg)
{
assert(spdk_get_thread() == bs->md_thread);
}

static void
blob_verify_md_op(struct spdk_blob *blob)
{
assert(blob != NULL);
assert(spdk_get_thread() == blob->bs->md_thread);
assert(blob->state != SPDK_BLOB_STATE_LOADING);
}

// Sending a message to the metadata thread
static void
blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
uint64_t cluster, uint32_t extent_page, struct spdk_blob_md_page *page,
spdk_blob_op_complete cb_fn, void *cb_arg)
{
struct spdk_blob_insert_cluster_ctx *ctx;

ctx = calloc(1, sizeof(*ctx));
if (ctx == NULL) {
cb_fn(cb_arg, -ENOMEM);
return;
}

ctx->thread = spdk_get_thread();
ctx->blob = blob;
ctx->cluster_num = cluster_num;
ctx->cluster = cluster;
ctx->extent_page = extent_page;
ctx->page = page;
ctx->cb_fn = cb_fn;
ctx->cb_arg = cb_arg;
// Send a message to the given thread. The message will be sent asynchronously - i.e. spdk_thread_send_msg will always return prior to fn being called.
spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx);
}
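
Note that ctx->thread records the calling thread precisely so the completion can be bounced back: once the metadata thread has inserted the cluster, it sends the result to ctx->thread with another spdk_thread_send_msg(), so cb_fn still runs on the thread that issued the operation.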

Blobstore metadata

The entry point for reading the metadata code:

void
spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
{
/* Calculate how many pages the metadata consumes at the front
* of the disk.
*/

/* The super block uses 1 page */
num_md_pages = 1;

/* The used_md_pages mask requires 1 bit per metadata page, rounded
* up to the nearest page, plus a header.
*/
ctx->super->used_page_mask_start = num_md_pages;
ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + spdk_divide_round_up(bs->md_len, 8), SPDK_BS_PAGE_SIZE);
num_md_pages += ctx->super->used_page_mask_len;

/* The used_clusters mask requires 1 bit per cluster, rounded
* up to the nearest page, plus a header.
*/
ctx->super->used_cluster_mask_start = num_md_pages;
ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + spdk_divide_round_up(bs->total_clusters, 8), SPDK_BS_PAGE_SIZE);
/* The blobstore might be extended, then the used_cluster bitmap will need more space.
* Here we calculate the max clusters we can support according to the
* num_md_pages (bs->md_len).
*/
max_used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + spdk_divide_round_up(bs->md_len, 8), SPDK_BS_PAGE_SIZE);
max_used_cluster_mask_len = spdk_max(max_used_cluster_mask_len, ctx->super->used_cluster_mask_len);
num_md_pages += max_used_cluster_mask_len;

/* The used_blobids mask requires 1 bit per metadata page, rounded
* up to the nearest page, plus a header.
*/
ctx->super->used_blobid_mask_start = num_md_pages;
ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + spdk_divide_round_up(bs->md_len, 8), SPDK_BS_PAGE_SIZE);
num_md_pages += ctx->super->used_blobid_mask_len;

/* The metadata region size was chosen above */
ctx->super->md_start = bs->md_start = num_md_pages;
ctx->super->md_len = bs->md_len;
num_md_pages += bs->md_len;

num_md_lba = bs_page_to_lba(bs, num_md_pages);

ctx->super->size = dev->blockcnt * dev->blocklen;

ctx->super->crc = blob_md_page_calc_crc(ctx->super);

num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster);
/* Claim all of the clusters used by the metadata */
for (i = 0; i < num_md_clusters; i++) {
spdk_bit_array_set(ctx->used_clusters, i);
}

bs->num_free_clusters -= num_md_clusters;
bs->total_data_clusters = bs->num_free_clusters;
}

From this you can see that the metadata layout at the front of the disk is essentially:

|super block|used_page_mask|used_cluster_mask|used_blobid_mask|metadata page|data cluster|

super block

struct spdk_bs_super_block {
uint8_t signature[8];
uint32_t version;
uint32_t length;
uint32_t clean; /* If there was a clean shutdown, this is 1. */
spdk_blob_id super_blob;

uint32_t cluster_size; /* In bytes */

uint32_t used_page_mask_start; /* Offset from beginning of disk, in pages */
uint32_t used_page_mask_len; /* Count, in pages */

uint32_t used_cluster_mask_start; /* Offset from beginning of disk, in pages */
uint32_t used_cluster_mask_len; /* Count, in pages */

uint32_t md_start; /* Offset from beginning of disk, in pages */
uint32_t md_len; /* Count, in pages */

struct spdk_bs_type bstype; /* blobstore type */

uint32_t used_blobid_mask_start; /* Offset from beginning of disk, in pages */
uint32_t used_blobid_mask_len; /* Count, in pages */

uint64_t size; /* size of blobstore in bytes */
uint32_t io_unit_size; /* Size of io unit in bytes */

uint8_t reserved[4000];
uint32_t crc;
};
SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_super_block) == 0x1000, "Invalid super block size");

Member initialization

ctx->super->version = SPDK_BS_VERSION;
ctx->super->length = sizeof(*ctx->super);
ctx->super->super_blob = bs->super_blob;
ctx->super->clean = 0;
ctx->super->cluster_size = bs->cluster_sz;
ctx->super->io_unit_size = bs->io_unit_size;
ctx->super->size = dev->blockcnt * dev->blocklen;
ctx->super->crc = blob_md_page_calc_crc(ctx->super);

mask

used_page_mask corresponds to used_md_pages
used_cluster_mask corresponds to used_clusters
used_blobid_mask corresponds to used_blobids
Why the length must also include sizeof(struct spdk_bs_md_mask):
/* The used_md_pages mask requires 1 bit per metadata page, rounded
* up to the nearest page, plus a header.
*/
ctx->super->used_page_mask_start = num_md_pages;
ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
spdk_divide_round_up(bs->md_len, 8),
SPDK_BS_PAGE_SIZE);
num_md_pages += ctx->super->used_page_mask_len;

Reason:
struct spdk_bs_md_mask {
uint8_t type;
uint32_t length; /* In bits */
uint8_t mask[0]; // zero-length (flexible) array
};

static void
bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
{
struct spdk_bs_load_ctx *ctx = arg;
uint64_t mask_size, lba, lba_count;

mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE;
ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);

ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES;
ctx->mask->length = ctx->super->md_len;
assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages));
// Serialize the used_md_pages bitmap into the mask array
spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask);
lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
// The payload is ctx->mask, so the on-disk mask includes the header (type + length)
bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
}
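
As a sanity check on the sizing formula: one 4096-byte page minus the spdk_bs_md_mask header leaves room for roughly 32K mask bits, so any blobstore with fewer than about 32K metadata pages (by default, one per cluster) needs only a single used_page_mask page.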

How this metadata then reaches the disk

void
spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
{
cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
cpl.u.bs_handle.cb_fn = cb_fn;
cpl.u.bs_handle.cb_arg = cb_arg;
cpl.u.bs_handle.bs = bs;

seq = bs_sequence_start_bs(bs->md_channel, &cpl);

batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx);

/* Clear metadata space */
bs_batch_write_zeroes_dev(batch, 0, num_md_lba);

lba = num_md_lba;
lba_count = ctx->bs->dev->blockcnt - lba;
switch (opts.clear_method) {
case BS_CLEAR_WITH_UNMAP:
/* Trim data clusters */
bs_batch_unmap_dev(batch, lba, lba_count);
break;
case BS_CLEAR_WITH_WRITE_ZEROES:
/* Write_zeroes to data clusters */
bs_batch_write_zeroes_dev(batch, lba, lba_count);
break;
case BS_CLEAR_WITH_NONE:
default:
break;
}

bs_batch_close(batch);
}
// Relevant structs
// One struct, three typedef aliases for three usage patterns
/* Use a sequence to submit a set of requests serially */
typedef struct spdk_bs_request_set spdk_bs_sequence_t; // serial execution

/* Use a batch to submit a set of requests in parallel */
typedef struct spdk_bs_request_set spdk_bs_batch_t; // parallel execution

/* Use a user_op to queue a user operation for later execution */
typedef struct spdk_bs_request_set spdk_bs_user_op_t;

enum spdk_bs_cpl_type {
SPDK_BS_CPL_TYPE_NONE,
SPDK_BS_CPL_TYPE_BS_BASIC,
SPDK_BS_CPL_TYPE_BS_HANDLE,
SPDK_BS_CPL_TYPE_BLOB_BASIC,
SPDK_BS_CPL_TYPE_BLOBID,
SPDK_BS_CPL_TYPE_BLOB_HANDLE,
SPDK_BS_CPL_TYPE_NESTED_SEQUENCE,
};
struct spdk_bs_cpl { // each completion type carries its own arguments and callback signature
enum spdk_bs_cpl_type type;
union {
struct {
spdk_bs_op_complete cb_fn;
void *cb_arg;
} bs_basic;

struct {
spdk_bs_op_with_handle_complete cb_fn;
void *cb_arg;
struct spdk_blob_store *bs;
} bs_handle;

struct {
spdk_blob_op_complete cb_fn;
void *cb_arg;
} blob_basic;

struct {
spdk_blob_op_with_id_complete cb_fn;
void *cb_arg;
spdk_blob_id blobid;
} blobid;

struct {
spdk_blob_op_with_handle_complete cb_fn;
void *cb_arg;
struct spdk_blob *blob;
void *esnap_ctx;
} blob_handle;

struct {
spdk_bs_nested_seq_complete cb_fn;
void *cb_arg;
spdk_bs_sequence_t *parent;
} nested_seq;
} u;
};

/* A generic request set. Can be a sequence, batch or a user_op. */
struct spdk_bs_request_set {
struct spdk_bs_cpl cpl;

int bserrno;

/*
* The blobstore's channel, obtained by blobstore consumers via
* spdk_bs_alloc_io_channel(). Used for IO to the blobstore.
*/
struct spdk_bs_channel *channel;
/*
* The channel used by the blobstore to perform IO on back_bs_dev. Unless the blob
* is an esnap clone, back_channel == spdk_io_channel_get_ctx(set->channel).
*/
struct spdk_io_channel *back_channel;

struct spdk_bs_dev_cb_args cb_args;

union { // parameters differ by usage
struct {
spdk_bs_sequence_cpl cb_fn;
void *cb_arg;
} sequence;

struct {
uint32_t outstanding_ops;
uint32_t batch_closed;
spdk_bs_sequence_cpl cb_fn;
void *cb_arg;
} batch;

struct spdk_bs_user_op_args {
int type;
int iovcnt;
struct spdk_blob *blob;
uint64_t offset;
uint64_t length;
spdk_blob_op_complete cb_fn;
void *cb_arg;
void *payload; /* cast to iov for readv/writev */
} user_op;
} u;
/* Pointer to ext_io_opts passed by the user */
struct spdk_blob_ext_io_opts *ext_io_opts;
TAILQ_ENTRY(spdk_bs_request_set) link;
};
// Key fields and helpers
outstanding_ops // number of operations still in flight
batch_closed // whether the batch has been closed to new submissions


batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx);
spdk_bs_batch_t *
bs_sequence_to_batch(spdk_bs_sequence_t *seq, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
{
struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq;

set->u.batch.cb_fn = cb_fn; // callback for when the whole batch completes
set->u.batch.cb_arg = cb_arg;
set->u.batch.outstanding_ops = 0;
set->u.batch.batch_closed = 0;

set->cb_args.cb_fn = bs_batch_completion; // callback for each individual operation

return set;
}
// Adding operations
/* Clear metadata space */
bs_batch_write_zeroes_dev(batch, 0, num_md_lba);
void
bs_batch_write_zeroes_dev(spdk_bs_batch_t *batch,
uint64_t lba, uint64_t lba_count)
{
struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch;
struct spdk_bs_channel *channel = set->channel;

SPDK_DEBUGLOG(blob_rw, "Zeroing %" PRIu64 " blocks at LBA %" PRIu64 "\n", lba_count, lba);

set->u.batch.outstanding_ops++; // one more op in flight
channel->dev->write_zeroes(channel->dev, channel->dev_channel, lba, lba_count,
&set->cb_args);
}

/* Trim data clusters */
bs_batch_unmap_dev(batch, lba, lba_count);
void
bs_batch_unmap_dev(spdk_bs_batch_t *batch,
uint64_t lba, uint64_t lba_count)
{
struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch;
struct spdk_bs_channel *channel = set->channel;

SPDK_DEBUGLOG(blob_rw, "Unmapping %" PRIu64 " blocks at LBA %" PRIu64 "\n", lba_count,
lba);

set->u.batch.outstanding_ops++; // one more op in flight
channel->dev->unmap(channel->dev, channel->dev_channel, lba, lba_count,
&set->cb_args);
}

// Done adding operations
bs_batch_close(batch);
void
bs_batch_close(spdk_bs_batch_t *batch)
{
struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch;

set->u.batch.batch_closed = 1; // close the batch to new submissions

if (set->u.batch.outstanding_ops == 0) { // everything already completed
if (set->u.batch.cb_fn) { // invoke the pre-set completion callback
set->cb_args.cb_fn = bs_sequence_completion;
set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, set->bserrno);
} else {
bs_request_set_complete(set);
}
}
}

// Per-operation completion callback
// Call path (using bs_batch_write_zeroes_dev as an example)
bs_batch_write_zeroes_dev
channel->dev->write_zeroes
bdev_blob_write_zeroes
bdev_blob_io_complete
cb_args->cb_fn // set->cb_args.cb_fn = bs_batch_completion;
static void
bs_batch_completion(struct spdk_io_channel *_channel,
void *cb_arg, int bserrno)
{
struct spdk_bs_request_set *set = cb_arg;

set->u.batch.outstanding_ops--; // one op finished
if (bserrno != 0) {
set->bserrno = bserrno;
}

if (set->u.batch.outstanding_ops == 0 && set->u.batch.batch_closed) { // all ops done and the batch is closed
if (set->u.batch.cb_fn) { // invoke the pre-set completion callback
set->cb_args.cb_fn = bs_sequence_completion;
set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, bserrno);
} else {
bs_request_set_complete(set);
}
}
}

// Callback invoked once the whole batch has completed
batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx);
static void
bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
struct spdk_bs_load_ctx *ctx = cb_arg;

/* Write super block */
bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
bs_init_persist_super_cpl, ctx);
}
// Callback invoked after the super block has been persisted
static void
bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
struct spdk_bs_load_ctx *ctx = cb_arg;

ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters);
spdk_free(ctx->super);
free(ctx);

bs_sequence_finish(seq, bserrno);
}

// spdk_bit_pool wraps spdk_bit_array
struct spdk_bit_pool {
struct spdk_bit_array *array;
uint32_t lowest_free_bit;
uint32_t free_count;
};

// Following the callback chain further
bs_init_persist_super_cpl
bs_sequence_finish
bs_request_set_complete
bs_call_cpl
// In spdk_bs_init, cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE
cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
cpl.u.bs_handle.cb_fn = cb_fn;
cpl.u.bs_handle.cb_arg = cb_arg;
cpl.u.bs_handle.bs = bs;
case SPDK_BS_CPL_TYPE_BS_HANDLE:
cpl->u.bs_handle.cb_fn(cpl->u.bs_handle.cb_arg, bserrno == 0 ? cpl->u.bs_handle.bs : NULL, bserrno);
bs_init_complete
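
The outstanding_ops/batch_closed pairing is a general completion-counting pattern worth isolating. A self-contained sketch of my own (plain C, not SPDK code) of the same logic:

#include <stdio.h>

/* Minimal model of bs_batch: count submissions, count completions, and fire
 * the final callback only when the batch is closed AND the count hits zero. */
struct batch {
    unsigned outstanding_ops;
    int closed;
    void (*cb_fn)(void *);
    void *cb_arg;
};

static void
batch_maybe_finish(struct batch *b)
{
    if (b->outstanding_ops == 0 && b->closed) {
        b->cb_fn(b->cb_arg);
    }
}

static void
batch_submit(struct batch *b)
{
    b->outstanding_ops++;      /* what bs_batch_*_dev() does */
}

static void
batch_op_done(struct batch *b)
{
    b->outstanding_ops--;      /* what bs_batch_completion() does */
    batch_maybe_finish(b);
}

static void
batch_close(struct batch *b)
{
    b->closed = 1;             /* bs_batch_close() */
    batch_maybe_finish(b);     /* covers the "already all done" case */
}

static void done(void *arg) { printf("batch complete\n"); }

int
main(void)
{
    struct batch b = { 0, 0, done, NULL };

    batch_submit(&b);
    batch_submit(&b);
    batch_close(&b);     /* not finished yet: 2 ops outstanding */
    batch_op_done(&b);
    batch_op_done(&b);   /* prints "batch complete" */
    return 0;
}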
// gdb backtrace at bs_init_complete
Thread 1 "reactor_0" hit Breakpoint 4, bs_init_complete (cb_arg=0xf19affa87bdd2000,
bs=0x7fffffffd5a0, bserrno=8192) at hello_blob.c:287
287 static void bs_init_complete(void *cb_arg, struct spdk_blob_store *bs, int bserrno) {
(gdb) bt
#0 bs_init_complete (cb_arg=0xf19affa87bdd2000, bs=0x7fffffffd5a0, bserrno=8192)
at hello_blob.c:287
#1 0x00005555555d6bb0 in bs_call_cpl (cpl=0x7fffffffd5e0, bserrno=0) at request.c:26
#2 0x00005555555d6d0d in bs_request_set_complete (set=0x555555d48c20) at request.c:63
#3 0x00005555555d77ec in bs_sequence_finish (seq=0x555555d48c20, bserrno=0) at request.c:295
#4 0x00005555555cd558 in bs_init_persist_super_cpl (seq=0x555555d48c20, cb_arg=0x555555ecfa20,
bserrno=0) at blobstore.c:5217
#5 0x00005555555d6d5e in bs_sequence_completion (channel=0x555555cd4480,
cb_arg=0x555555d48c20, bserrno=0) at request.c:72
#6 0x00005555555b3b8e in bdev_blob_io_complete (bdev_io=0x200013aa2700, success=true,
arg=0x555555d48c60) at blob_bdev.c:64
#7 0x000055555568e23c in _bdev_io_complete (ctx=0x200013aa2700) at bdev.c:6970
#8 0x000055555568e3dc in bdev_io_complete (ctx=0x200013aa2700) at bdev.c:7003
#9 0x000055555568e900 in spdk_bdev_io_complete (bdev_io=0x200013aa2700,
status=SPDK_BDEV_IO_STATUS_SUCCESS) at bdev.c:7131
#10 0x0000555555570c6c in malloc_done (ref=0x200013aa2ae0, status=0) at bdev_malloc.c:130
#11 0x0000555555570f05 in malloc_sequence_done (ctx=0x200013aa2ae0, status=0)
at bdev_malloc.c:225
#12 0x000055555569df8b in accel_sequence_complete (seq=0x555555e1dc40) at accel.c:1250
#13 0x000055555569f43c in accel_process_sequence (seq=0x555555e1dc40) at accel.c:1675
#14 0x000055555569f749 in accel_sequence_task_cb (cb_arg=0x555555e1dc40, status=0)
at accel.c:1750
#15 0x000055555569b67d in spdk_accel_task_complete (accel_task=0x555555d5ddb0, status=0)
at accel.c:292
#16 0x00005555556a4d66 in accel_comp_poll (arg=0x555555e75cc0) at accel_sw.c:525
#17 0x00005555556b6511 in thread_execute_poller (thread=0x555555ce17e0, poller=0x555555cd7b10)
at thread.c:946
#18 0x00005555556b6a95 in thread_poll (thread=0x555555ce17e0, max_msgs=0, now=3835790851557406)
at thread.c:1072
#19 0x00005555556b6d44 in spdk_thread_poll (thread=0x555555ce17e0, max_msgs=0,
now=3835790851557406) at thread.c:1156
#20 0x0000555555679572 in _reactor_run (reactor=0x555555ce14c0) at reactor.c:914
#21 0x0000555555679664 in reactor_run (arg=0x555555ce14c0) at reactor.c:952
#22 0x0000555555679aeb in spdk_reactors_start () at reactor.c:1068
#23 0x0000555555675c11 in spdk_app_start (opts_user=0x7fffffffde70,
start_fn=0x5555555705cc <hello_start>, arg1=0x555555cce4c0) at app.c:827
--Type <RET> for more, q to quit, c to continue without paging--
#24 0x00005555555707a7 in main (argc=2, argv=0x7fffffffe058) at hello_blob.c:390
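
Two things are worth noticing in this backtrace: the "asynchronous" init chain actually completes inside a single reactor poll (frames #17-#22), and the Malloc bdev's I/O is finished by the accel framework's software poller (accel_comp_poll, frame #16), which is where the completion callbacks start unwinding.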

Blob metadata format

Write path

Read path

Using the SPDK interfaces directly

The code below is from leveldb-direct, which implements LevelDB's Env interface on top of SPDK's low-level NVMe driver API.

void write_complete(void *arg, const struct spdk_nvme_cpl *completion)  // write completion callback
{
int* compl_status = static_cast<int*>(arg);
*compl_status = 1;
if (spdk_nvme_cpl_is_error(completion)) {
fprintf(stderr, "spdk write cpl error\n");
*compl_status = 2;
}
}
// For a synchronous call chk_compl is nullptr; for an asynchronous call it points to the caller's int compl_status_
void write_from_buf(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
void *buf, uint64_t lba, uint32_t cnt, int* chk_compl)
{
int rc;

if (chk_compl != nullptr) { // asynchronous: submit spdk_nvme_ns_cmd_write and return without blocking
rc = spdk_nvme_ns_cmd_write(ns, qpair, buf, lba, cnt, write_complete, chk_compl, 0);
if (rc != 0) {
fprintf(stderr, "spdk cmd wirte failed\n");
exit(1);
}
return;
}
// synchronous: submit the write, then poll the CQ until it completes
int l_chk_cpl = 0;
rc = spdk_nvme_ns_cmd_write(ns, qpair, buf, lba, cnt, write_complete, &l_chk_cpl, 0);
if (rc != 0) {
fprintf(stderr, "spdk write failed\n");
exit(1);
}
while (!l_chk_cpl)
spdk_nvme_qpair_process_completions(qpair, 0);
}
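
The key point in both paths is that spdk_nvme_ns_cmd_write() only queues the command: write_complete never fires on its own; it is invoked from inside spdk_nvme_qpair_process_completions(). So the synchronous path must spin on the completion queue, and an asynchronous caller must likewise keep polling the qpair until *chk_compl becomes nonzero.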

Related material

SPDK: A Development Kit to Build High Performance Storage Applications

SPDK NVMe: An In-depth Look at its Architecture and Design

Hardcore virtualization: how SR-IOV works (硬核虚拟化技术 SR-IOV的原理及探索)
The DPU-CPU interconnect debate: Virtio or SR-IOV? (DPU和CPU互联的接口之争)
SPDK overview (SPDK概览)
SPDK bdev in depth (SPDK bdev详解)