zns-spdk-env

开发步骤

相似代码:

leveldb-direct

ZapRAID论文代码

leveldb自定义env

代码仓库:

zns-spdk-env

参考文档:

SPDK Libraries

Linking SPDK applications with pkg-config

1 熟悉API使用

目标:使用spdk bdev接口清空ZNS SSD,写满若干个zone并读取验证,熟悉基本的API

如何编译?

拷贝examples/bdev/hello_world/hello_bdev.c代码,在外部编译

刚开始参考官方文档:SPDK Libraries

1
2
3
4
5
1 An application can link to the top level shared object library as follows:
gcc -o my_app ./my_app.c -lspdk -lspdk_env_dpdk -ldpdk
2 An application can link to only a subset of libraries by linking directly to the ones it relies on:
gcc -o my_app ./my_app.c -lpassthru_external -lspdk_event_bdev -lspdk_bdev -lspdk_bdev_malloc
-lspdk_log -lspdk_thread -lspdk_util -lspdk_event -lspdk_env_dpdk -ldpdk

显示/usr/bin/ld: cannot find -ldpdk

阅读leveldb-direct与ZapRAID相关编译文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# leveldb-direct
target_link_libraries(leveldb
spdk_nvme spdk_thread spdk_util spdk_log spdk_sock spdk_vmd spdk_env_dpdk
dpdk rte_bus_pci rte_bus_vdev rte_cmdline rte_compressdev rte_cryptodev rte_eal rte_ethdev
rte_fib rte_hash rte_kvargs rte_mbuf rte_mempool rte_mempool_bucket rte_mempool_ring rte_meter
rte_net rte_pci rte_rib rte_ring rte_vhost
numa dl uuid
)
# ZapRAID
target_link_libraries(zapraid -L${SPDK_DIR}/build/lib -L${SPDK_DIR}/dpdk/build/lib
-Wl,--whole-archive -Wl,--no-as-needed -Wl,-Bstatic
spdk_json spdk_jsonrpc spdk_rpc
spdk_log spdk_sock spdk_util spdk_trace
spdk_thread spdk_nvme
spdk_init spdk_env_dpdk_rpc spdk_event
-Wl,--no-whole-archive -Wl,-Bdynamic
spdk_env_dpdk rte_mempool rte_telemetry
rte_eal rte_kvargs rte_pci rte_bus_pci rte_ring rte_mempool_ring
pthread uuid rt isal dl)

发现都是采用链接rte_xxx的方式,不知道-ldpdk是否真的能用

在github上搜索spdk demo,找到了spdk-demo,参考它的方式编写Makefile

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
CC := g++

# All libs
ALL_SPDK_LIBS := spdk_accel_ioat spdk_blobfs spdk_jsonrpc \
spdk_accel_modules spdk_blob spdk_log \
spdk_accel spdk_conf spdk_lvol \
spdk_bdev_aio spdk_dpdklibs spdk_nbd \
spdk_bdev_delay spdk_env_dpdk spdk_net \
spdk_bdev_error spdk_env_dpdk_rpc spdk_notify \
spdk_bdev_ftl spdk_event_accel spdk_nvme \
spdk_bdev_gpt spdk_event_bdev spdk_nvmf \
spdk_bdev_lvol spdk_event_iscsi spdk_rpc \
spdk_bdev_malloc spdk_event_nbd spdk_scsi \
spdk_bdev_modules spdk_event_net spdk_sock_modules \
spdk_bdev_null spdk_event_nvmf spdk_sock \
spdk_bdev_nvme spdk_event spdk_sock_posix \
spdk_bdev_passthru spdk_event_scsi spdk_syslibs \
spdk_bdev spdk_event_sock spdk_thread \
spdk_bdev_raid spdk_event_vhost spdk_trace \
spdk_bdev_split spdk_event_vmd spdk_util \
spdk_bdev_virtio spdk_ftl \
spdk_bdev_zone_block spdk_ioat spdk_vhost \
spdk_blob_bdev spdk_iscsi spdk_virtio \
spdk_blobfs_bdev spdk_json spdk_vmd \
spdk_thread

# PKG-CONFIG
SPDK_BUILD_DIR=/home/hanshukai/import_libs/spdk/build/lib
SPDK_PKG_CONFIG_PATH=$(SPDK_BUILD_DIR)/pkgconfig
SPDK_LINK_FLAGS := $(shell PKG_CONFIG_PATH="$(SPDK_PKG_CONFIG_PATH)" pkg-config --cflags --libs $(ALL_SPDK_LIBS))
SPDK_SYSLIB_FLAGS := $(shell PKG_CONFIG_PATH="$(SPDK_PKG_CONFIG_PATH)" pkg-config --cflags --libs --static spdk_syslibs)


DPDK_BUILD_DIR=/home/hanshukai/import_libs/spdk/dpdk/build/lib
DPDK_PKG_CONFIG_PATH=$(DPDK_BUILD_DIR)/pkgconfig
DPDK_LINK_FLAGS := $(shell PKG_CONFIG_PATH="$(DPDK_PKG_CONFIG_PATH)" pkg-config --cflags --libs libdpdk)

LINK_FLAGS := -lpthread -lrt -lnuma -ldl -luuid -lm -lisal

all: reactor_demo bdev_demo

reactor_demo: clean
$(CC) --std=c++11 reactor_demo.cc -o reactor_demo $(LINK_FLAGS) -Wl,--whole-archive $(SPDK_LINK_FLAGS) $(DPDK_LINK_FLAGS) -Wl,--no-whole-archive $(SPDK_SYSLIB_FLAGS)

bdev_demo: clean
$(CC) --std=c++11 bdev_demo.cc -o bdev_demo $(LINK_FLAGS) -Wl,--whole-archive $(SPDK_LINK_FLAGS) $(DPDK_LINK_FLAGS) -Wl,--no-whole-archive $(SPDK_SYSLIB_FLAGS)

clean:
rm -rf reactor_demo bdev_demo

export:
export LD_LIBRARY_PATH=/home/hanshukai/import_libs/spdk/build/lib:/home/hanshukai/import_libs/spdk/dpdk/build/lib

pkgconfig目录中的.pc文件示例,简要了解pkg-config命令的原理

1
2
3
4
5
6
7
8
# build/lib/pkgconfig/spdk_bdev.pc
Description: SPDK bdev library
Name: spdk_bdev
Version: 13.0
Libs: -L/root/xxx/spdk/build/lib -lspdk_bdev
Requires: spdk_accel spdk_log spdk_util spdk_thread spdk_json spdk_jsonrpc spdk_rpc spdk_notify spdk_trace spdk_dma spdk_bdev_modules
Libs.private:
Cflags: -I/root/xxx/spdk/build/include

Linking SPDK applications with pkg-config

相关编译选项

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
-Wl,--whole-archive

gcc
-Wl,<options> Pass comma-separated <options> on to the linker.
-llibrary 指定编译的时候使用的库
-Ldir 指定编译的时候,搜索库的路径。比如你自己的库,可以用它指定目录,不然编译器将只在标准库的目录找。这个dir就是目录的名称。

ld
--whole-archive Include all objects from following archives
--no-whole-archive Turn off --whole-archive

--as-needed Only set DT_NEEDED for following dynamic libs if used
--no-as-needed Always set DT_NEEDED for dynamic libraries mentioned on
-rpath PATH Set runtime shared library search path
-rpath-link PATH Set link time shared library search path

静态库编译

运行时出现以下问题

1
2
3
4
5
6
[2023-11-14 09:43:32.184966] Starting SPDK v23.09.1-pre git sha1 aa8059716 / DPDK 23.07.0 initialization...
[2023-11-14 09:43:32.185063] [ DPDK EAL parameters: hello_bdev --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid1610101 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-11-14 09:43:32.193459] app.c: 657:claim_cpu_cores: *ERROR*: Cannot create lock on core 0, probably process 1609753 has claimed it.
[2023-11-14 09:43:32.193488] app.c: 779:spdk_app_start: *ERROR*: Unable to acquire lock on assigned core mask - exiting.
[2023-11-14 09:43:32.193495] test.c: 308:main: *ERROR*: ERROR starting application
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
Thread 1 "test" hit Breakpoint 4, rte_mempool_set_ops_byname (mp=0x7fffffffd970, name=0x340048 <error: Cannot access memory at address 0x340048>, pool_config=0x7fffffffdb10) at ../lib/mempool/rte_mempool_ops.c:167
167 {
(gdb) bt
#0 rte_mempool_set_ops_byname (mp=0x7fffffffd970, name=0x340048 <error: Cannot access memory at address 0x340048>, pool_config=0x7fffffffdb10) at ../lib/mempool/rte_mempool_ops.c:167
#1 0x00007ffff7e7fe5c in rte_mempool_create (name=0x7fffffffdb10 "evtpool_1608924", n=262143, elt_size=32, cache_size=512, private_data_size=0, mp_init=0x0, mp_init_arg=0x0, obj_init=0x0, obj_init_arg=0x0, socket_id=-1, flags=0) at ../lib/mempool/rte_mempool.c:976
#2 0x0000555555569ed8 in spdk_mempool_create_ctor (name=0x7fffffffdb10 "evtpool_1608924", count=262143, ele_size=32, cache_size=512, socket_id=-1, obj_init=0x0, obj_init_arg=0x0) at env.c:182
#3 0x0000555555569f34 in spdk_mempool_create (name=0x7fffffffdb10 "evtpool_1608924", count=262143, ele_size=32, cache_size=18446744073709551615, socket_id=-1) at env.c:194
#4 0x000055555569edc8 in spdk_reactors_init (msg_mempool_size=262143) at reactor.c:217
#5 0x000055555569cf52 in spdk_app_start (opts_user=0x7fffffffdeb0, start_fn=0x555555569017 <hello_start>, arg1=0x7fffffffde50) at app.c:788
#6 0x0000555555569408 in main (argc=1, argv=0x7fffffffe0a8) at test.c:306
(gdb) s
168 struct rte_mempool_ops *ops = NULL;
(gdb) p name
$5 = 0x7ffff7e84176 "ring_mp_mc"
(gdb) p c
$6 = {sl = {locked = 0}, num_ops = 0, ops = {{name = '\000' <repeats 31 times>, alloc = 0x0, free = 0x0, enqueue = 0x0, dequeue = 0x0, get_count = 0x0, calc_mem_size = 0x0, populate = 0x0, get_info = 0x0, dequeue_contig_blocks = 0x0} <repeats 16 times>}}

通过gdb调试发现rte_mempool_ops_table为空

对比标准的hello_bdev程序显示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
(gdb) bt
#0 rte_mempool_set_ops_byname (mp=0x7fffffffd970, name=0x1720048 <error: Cannot access memory at address 0x1720048>, pool_config=0x7fffffffdb10) at ../lib/mempool/rte_mempool_ops.c:167
#1 0x00005555557e3e31 in rte_mempool_create (name=0x7fffffffdb10 "evtpool_1609753", n=262143, elt_size=32, cache_size=512, private_data_size=0, mp_init=0x0, mp_init_arg=0x0, obj_init=0x0, obj_init_arg=0x0, socket_id=-1,
flags=0) at ../lib/mempool/rte_mempool.c:976
#2 0x0000555555657766 in spdk_mempool_create_ctor (name=0x7fffffffdb10 "evtpool_1609753", count=262143, ele_size=32, cache_size=512, socket_id=-1, obj_init=0x0, obj_init_arg=0x0) at env.c:182
#3 0x00005555556577c2 in spdk_mempool_create (name=0x7fffffffdb10 "evtpool_1609753", count=262143, ele_size=32, cache_size=18446744073709551615, socket_id=-1) at env.c:194
#4 0x0000555555677811 in spdk_reactors_init (msg_mempool_size=262143) at reactor.c:217
#5 0x000055555567599b in spdk_app_start (opts_user=0x7fffffffdeb0, start_fn=0x555555570317 <hello_start>, arg1=0x7fffffffde50) at app.c:788
#6 0x00005555555706f9 in main (argc=1, argv=0x7fffffffe098) at hello_bdev.c:306
(gdb) s
168 struct rte_mempool_ops *ops = NULL;
(gdb) p name
$1 = 0x555555a07a36 "ring_mp_mc"
(gdb) p rte_mempool_ops_table
$2 = {sl = {locked = 0}, num_ops = 6, ops = {{name = "ring_mp_mc", '\000' <repeats 21 times>, alloc = 0x5555557ef19e <common_ring_alloc>, free = 0x5555557ef232 <common_ring_free>,
# 以下省略

对应的源码文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
// rte_mempool_ring.c
/*
* The following 4 declarations of mempool ops structs address
* the need for the backward compatible mempool handlers for
* single/multi producers and single/multi consumers as dictated by the
* flags provided to the rte_mempool_create function
*/
static const struct rte_mempool_ops ops_mp_mc = {
.name = "ring_mp_mc",
.alloc = common_ring_alloc,
.free = common_ring_free,
.enqueue = common_ring_mp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
};

static const struct rte_mempool_ops ops_sp_sc = {
.name = "ring_sp_sc",
.alloc = common_ring_alloc,
.free = common_ring_free,
.enqueue = common_ring_sp_enqueue,
.dequeue = common_ring_sc_dequeue,
.get_count = common_ring_get_count,
};

static const struct rte_mempool_ops ops_mp_sc = {
.name = "ring_mp_sc",
.alloc = common_ring_alloc,
.free = common_ring_free,
.enqueue = common_ring_mp_enqueue,
.dequeue = common_ring_sc_dequeue,
.get_count = common_ring_get_count,
};

static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = common_ring_alloc,
.free = common_ring_free,
.enqueue = common_ring_sp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
};

/* ops for mempool with ring in MT_RTS sync mode */
static const struct rte_mempool_ops ops_mt_rts = {
.name = "ring_mt_rts",
.alloc = rts_ring_alloc,
.free = common_ring_free,
.enqueue = rts_ring_mp_enqueue,
.dequeue = rts_ring_mc_dequeue,
.get_count = common_ring_get_count,
};

/* ops for mempool with ring in MT_HTS sync mode */
static const struct rte_mempool_ops ops_mt_hts = {
.name = "ring_mt_hts",
.alloc = hts_ring_alloc,
.free = common_ring_free,
.enqueue = hts_ring_mp_enqueue,
.dequeue = hts_ring_mc_dequeue,
.get_count = common_ring_get_count,
};
RTE_MEMPOOL_REGISTER_OPS(ops_mp_mc);
RTE_MEMPOOL_REGISTER_OPS(ops_sp_sc);
RTE_MEMPOOL_REGISTER_OPS(ops_mp_sc);
RTE_MEMPOOL_REGISTER_OPS(ops_sp_mc);
RTE_MEMPOOL_REGISTER_OPS(ops_mt_rts);
RTE_MEMPOOL_REGISTER_OPS(ops_mt_hts);

通过nm ldd命令对比test与hello_bdev不同

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
 # test与标准hello_bdev对比 
nm test | grep ops_mt_hts
# 结果为空
nm hello_bdev | grep ops_mt_hts
000000000029b2c8 t mp_hdlr_init_ops_mt_hts
000000000052a3c0 d ops_mt_hts

ldd hello_bdev
linux-vdso.so.1 (0x00007ffe33ff6000)
libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007f300a76b000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f300a765000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f300a75b000)
libuuid.so.1 => /lib/x86_64-linux-gnu/libuuid.so.1 (0x00007f300a752000)
libssl.so.1.1 => /lib/x86_64-linux-gnu/libssl.so.1.1 (0x00007f300a6bf000)
libcrypto.so.1.1 => /lib/x86_64-linux-gnu/libcrypto.so.1.1 (0x00007f300a3e9000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f300a298000)
libaio.so.1 => /lib/x86_64-linux-gnu/libaio.so.1 (0x00007f300a293000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f300a270000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f300a07e000)
/lib64/ld-linux-x86-64.so.2 (0x00007f300aeed000)

ldd test
linux-vdso.so.1 (0x00007ffe55687000)
librte_eal.so.23 => /root/xxx/spdk/dpdk/build/lib/librte_eal.so.23 (0x00007f19b437f000)
librte_mempool.so.23 => /root/xxx/spdk/dpdk/build/lib/librte_mempool.so.23 (0x00007f19b436f000)
librte_ring.so.23 => /root/xxx/spdk/dpdk/build/lib/librte_ring.so.23 (0x00007f19b436a000)
librte_bus_pci.so.23 => /root/xxx/spdk/dpdk/build/lib/librte_bus_pci.so.23 (0x00007f19b4356000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f19b434c000)
libuuid.so.1 => /lib/x86_64-linux-gnu/libuuid.so.1 (0x00007f19b4343000)
libssl.so.1.1 => /lib/x86_64-linux-gnu/libssl.so.1.1 (0x00007f19b42ae000)
libcrypto.so.1.1 => /lib/x86_64-linux-gnu/libcrypto.so.1.1 (0x00007f19b3fd8000)
libaio.so.1 => /lib/x86_64-linux-gnu/libaio.so.1 (0x00007f19b3fd3000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f19b3e84000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f19b3e69000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f19b3e46000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f19b3c52000)
/lib64/ld-linux-x86-64.so.2 (0x00007f19b4799000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f19b3c4c000)
libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007f19b3c3f000)
librte_kvargs.so.23 => /root/xxx/spdk/dpdk/build/lib/librte_kvargs.so.23 (0x00007f19b3c3a000)
librte_telemetry.so.23 => /root/xxx/spdk/dpdk/build/lib/librte_telemetry.so.23 (0x00007f19b3c2e000)
librte_pci.so.23 => /root/xxx/spdk/dpdk/build/lib/librte_pci.so.23 (0x00007f19b3c27000)

结论:编译时未将rte_mempool_ring包含进来,dpdk相关库仍然采用动态库编译

加上-Wl,-Bstatic强制链接静态库(由于是g++,故加上-fpermissive允许void*类型转换)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
g++ test.c  -fpermissive -pthread -g  -o test  -Wl,--whole-archive -Wl,-Bstatic  -I/root/xxx/spdk/build/include -L/root/xxx/spdk/build/lib -L/root/xxx/spdk/dpdk/build/lib -L/root/xxx/spdk/build/lib -lspdk_env_dpdk -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf -lrte_bus_pci -lrte_pci -lrte_mempool_ring -lrte_telemetry -lrte_kvargs -lrte_rcu -lrte_power -lrte_ethdev -lrte_net -lrte_vhost -lrte_net -lrte_dmadev -lrte_cryptodev -lrte_hash -lspdk_bdev -lspdk_notify -lspdk_bdev_malloc -lspdk_bdev_null -lspdk_bdev_nvme -lspdk_bdev_passthru -lspdk_bdev_lvol -lspdk_bdev_raid -lspdk_accel -lspdk_accel_ioat -lspdk_ioat -lspdk_bdev_error -lspdk_bdev_gpt -lspdk_bdev_split -lspdk_bdev_delay -lspdk_bdev_zone_block -lspdk_blobfs_bdev -lspdk_blobfs -lspdk_blob_bdev -lspdk_lvol -lspdk_blob -lspdk_dma -lspdk_vmd -lspdk_nvme -lspdk_sock -lspdk_sock_posix -lspdk_bdev_aio -lspdk_bdev_ftl -lspdk_ftl -lspdk_bdev_virtio -lspdk_virtio -lspdk_thread -lspdk_trace -lspdk_rpc -lspdk_jsonrpc -lspdk_json -lspdk_util -lspdk_vfio_user -lspdk_log  -Wl,--no-whole-archive   -L/root/xxx/spdk/isa-l/.libs -L/root/xxx/spdk/isa-l-crypto/.libs -lisal -lisal_crypto -pthread -lrt -luuid -lssl -lcrypto -lm -laio -lnuma -ldl  
test.c:15:28: warning: ISO C++ forbids converting a string constant to ‘char*’ [-Wwrite-strings]
15 | static char *g_bdev_name = "Malloc0";
| ^~~~~~~~~
test.c: In function ‘void read_complete(spdk_bdev_io*, bool, void*)’:
test.c:62:42: warning: invalid conversion from ‘void*’ to ‘hello_context_t*’ [-fpermissive]
62 | struct hello_context_t *hello_context = cb_arg;
| ^~~~~~
| |
| void*
# 省略一部分输出
/usr/bin/ld: /root/xxx/spdk/dpdk/build/lib/librte_net.a(net_rte_arp.c.o): in function `rte_net_make_rarp_packet':
/root/xxx/spdk/dpdk/build-tmp/../lib/net/rte_arp.c:11: multiple definition of `rte_net_make_rarp_packet'; /root/xxx/spdk/dpdk/build/lib/librte_net.a(net_rte_arp.c.o):/root/xxx/spdk/dpdk/build-tmp/../lib/net/rte_arp.c:11: first defined here
# 省略一部分输出
/root/xxx/spdk/dpdk/build/lib/librte_net.a(net_crc_avx512.c.o):/root/xxx/spdk/dpdk/build-tmp/../lib/net/net_crc_avx512.c:414: first defined here
/usr/bin/ld: /root/xxx/spdk/dpdk/build/lib/librte_eal.a(eal_common_eal_common_options.c.o): in function `eal_dlopen':
/root/xxx/spdk/dpdk/build-tmp/../lib/eal/common/eal_common_options.c:466: warning: Using 'dlopen' in statically linked applications requires at runtime the shared libraries from the glibc version used for linking
/usr/bin/ld: cannot find -lgcc_s
/usr/bin/ld: /root/xxx/spdk/dpdk/build/lib/librte_eal.a(eal_common_eal_common_trace_utils.c.o): in function `trace_dir_default_path_get':
/root/xxx/spdk/dpdk/build-tmp/../lib/eal/common/eal_common_trace_utils.c:288: warning: Using 'getpwuid' in statically linked applications requires at runtime the shared libraries from the glibc version used for linking
/usr/bin/ld: /root/xxx/spdk/build/lib/libspdk_nvme.a(nvme_tcp.o): in function `nvme_tcp_parse_addr':
/root/xxx/spdk/lib/nvme/nvme_tcp.c:277: warning: Using 'getaddrinfo' in statically linked applications requires at runtime the shared libraries from the glibc version used for linking
/usr/bin/ld: /usr/lib/gcc/x86_64-linux-gnu/9/../../../x86_64-linux-gnu/libcrypto.a(b_sock.o): in function `BIO_gethostbyname':
(.text+0x75): warning: Using 'gethostbyname' in statically linked applications requires at runtime the shared libraries from the glibc version used for linking
/usr/bin/ld: cannot find -lgcc_s
collect2: error: ld returned 1 exit status
make: *** [Makefile:9: test] Error 1

编译时出现重定义问题,未能解决,放弃静态库编译方式

动态库编译

加上--with-shared选项重新编译spdk库

ubuntu下直接在/etc/ld.so.conf.d下加入spdk.conf,添加spdk与dpdk动态库搜索路径

1
2
/root/xxx/spdk/dpdk/build/lib
/root/xxx/spdk/build/lib

使用ldconfig更新,并使用ldconfig -v查看路径是否添加成功

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/root/xxx/spdk/dpdk/build/lib:
librte_compressdev.so.23 -> librte_compressdev.so.23.2
librte_mempool_ring.so.23 -> librte_mempool_ring.so.23.2
librte_eal.so.23 -> librte_eal.so.23.2
librte_kvargs.so.23 -> librte_kvargs.so.23.2
librte_rcu.so.23 -> librte_rcu.so.23.2
librte_mempool.so.23 -> librte_mempool.so.23.2
librte_mbuf.so.23 -> librte_mbuf.so.23.2
librte_dmadev.so.23 -> librte_dmadev.so.23.2
librte_cryptodev.so.23 -> librte_cryptodev.so.23.2
librte_bus_vdev.so.23 -> librte_bus_vdev.so.23.2
librte_ethdev.so.23 -> librte_ethdev.so.23.2
librte_pci.so.23 -> librte_pci.so.23.2
librte_bus_pci.so.23 -> librte_bus_pci.so.23.2
librte_vhost.so.23 -> librte_vhost.so.23.2
librte_telemetry.so.23 -> librte_telemetry.so.23.2
librte_meter.so.23 -> librte_meter.so.23.2
librte_timer.so.23 -> librte_timer.so.23.2
librte_net.so.23 -> librte_net.so.23.2
librte_security.so.23 -> librte_security.so.23.2
librte_reorder.so.23 -> librte_reorder.so.23.2
librte_hash.so.23 -> librte_hash.so.23.2
librte_ring.so.23 -> librte_ring.so.23.2
librte_power.so.23 -> librte_power.so.23.2
librte_cmdline.so.23 -> librte_cmdline.so.23.2
/root/xxx/spdk/build/lib:
libspdk_conf.so.5.0 -> libspdk_conf.so.5.0
libspdk_vfio_user.so.4.0 -> libspdk_vfio_user.so.4.0
libspdk_blobfs.so.9.0 -> libspdk_blobfs.so.9.0
libspdk_bdev_gpt.so.5.0 -> libspdk_bdev_gpt.so.5.0
libspdk_event_sock.so.4.0 -> libspdk_event_sock.so.4.0
libspdk_nvme.so.11.0 -> libspdk_nvme.so.11.0
libspdk_nvmf.so.16.0 -> libspdk_nvmf.so.16.0
# 省略部分输出

运行时出现问题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
./test -c zns.json   -b Nvme0n1
[2023-11-14 11:05:37.380098] Starting SPDK v23.09.1-pre git sha1 aa8059716 / DPDK 23.07.0 initialization...
[2023-11-14 11:05:37.380137] [ DPDK EAL parameters: hello_bdev --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid1653437 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-11-14 11:05:37.386954] app.c: 786:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-11-14 11:05:37.413012] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-11-14 11:05:37.614963] thread.c:2277:spdk_get_io_channel: *ERROR*: could not find io_device 0x7f37758c51e0
[2023-11-14 11:05:37.614980] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device Nvme0 (0x55fafe79e6d0): Operation not permitted (rc=-1)
[2023-11-14 11:05:37.614984] bdev_nvme.c: 651:_bdev_nvme_add_io_path: *ERROR*: Failed to alloc io_channel.
[2023-11-14 11:05:37.614988] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device Nvme0n1 (0x55fafe79f7d0): Cannot allocate memory (rc=-12)
[2023-11-14 11:05:37.614991] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device bdev_Nvme0n1 (0x55fafe79f7d1): Operation not permitted (rc=-1)
[2023-11-14 11:05:37.614994] blobstore.c:3334:bs_channel_create: *ERROR*: Failed to create device channel.
[2023-11-14 11:05:37.614997] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device blobstore (0x55fafe7a0380): Operation not permitted (rc=-1)
[2023-11-14 11:05:37.615000] blobstore.c:5807:bs_register_md_thread: *ERROR*: Failed to get IO channel.
[2023-11-14 11:05:37.615040] thread.c:2277:spdk_get_io_channel: *ERROR*: could not find io_device 0x7f37758c51e0
[2023-11-14 11:05:37.615045] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device Nvme0 (0x55fafe79e6d0): Operation not permitted (rc=-1)
[2023-11-14 11:05:37.615048] bdev_nvme.c: 651:_bdev_nvme_add_io_path: *ERROR*: Failed to alloc io_channel.
[2023-11-14 11:05:37.615051] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device Nvme0n1 (0x55fafe79f7d0): Cannot allocate memory (rc=-12)
[2023-11-14 11:05:37.615054] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device bdev_Nvme0n1 (0x55fafe79f7d1): Operation not permitted (rc=-1)
[2023-11-14 11:05:37.615057] vbdev_gpt.c: 531:vbdev_gpt_read_gpt: *ERROR*: Failed to get an io_channel.
[2023-11-14 11:05:37.615063] vbdev_gpt.c: 584:vbdev_gpt_examine: *ERROR*: Failed to read info from bdev Nvme0n1
[2023-11-14 11:05:37.617497] test.c: 222:hello_start: *NOTICE*: Successfully started the application
[2023-11-14 11:05:37.617504] test.c: 231:hello_start: *NOTICE*: Opening the bdev Nvme0n1
[2023-11-14 11:05:37.617507] test.c: 244:hello_start: *NOTICE*: Opening io channel
[2023-11-14 11:05:37.617511] thread.c:2277:spdk_get_io_channel: *ERROR*: could not find io_device 0x7f37758c51e0
[2023-11-14 11:05:37.617514] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device Nvme0 (0x55fafe79e6d0): Operation not permitted (rc=-1)
[2023-11-14 11:05:37.617517] bdev_nvme.c: 651:_bdev_nvme_add_io_path: *ERROR*: Failed to alloc io_channel.
[2023-11-14 11:05:37.617519] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device Nvme0n1 (0x55fafe79f7d0): Cannot allocate memory (rc=-12)
[2023-11-14 11:05:37.617522] thread.c:2339:spdk_get_io_channel: *ERROR*: could not create io_channel for io_device bdev_Nvme0n1 (0x55fafe79f7d1): Operation not permitted (rc=-1)
[2023-11-14 11:05:37.617525] test.c: 248:hello_start: *ERROR*: Could not create bdev I/O channel!!
[2023-11-14 11:05:37.617528] app.c: 898:spdk_app_stop: *WARNING*: spdk_app_stop'd on non-zero
[2023-11-14 11:05:42.617801] thread.c: 628:thread_exit: *ERROR*: thread app_thread got timeout, and move it to the exited state forcefully
[2023-11-14 11:05:42.617818] test.c: 308:main: *ERROR*: ERROR starting application
[2023-11-14 11:05:42.617856] thread.c: 464:spdk_thread_lib_fini: *ERROR*: io_device Nvme0 not unregistered
[2023-11-14 11:05:42.617860] thread.c: 464:spdk_thread_lib_fini: *ERROR*: io_device Nvme0n1 not unregistered
[2023-11-14 11:05:42.617863] thread.c: 464:spdk_thread_lib_fini: *ERROR*: io_device bdev_Nvme0n1 not unregistered
[2023-11-14 11:05:42.617867] thread.c: 379:_free_thread: *WARNING*: timed_poller bdev_nvme_poll_adminq still registered at thread exit
[2023-11-14 11:05:42.649495] pci.c: 353:pci_env_fini: *ERROR*: Device 0000:01:00.0 is still attached at shutdown!

问题原因:nvme_if模块未能成功加载,g_nvme_bdev_ctrlrs未注册

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
static int
bdev_nvme_library_init(void)
{
g_bdev_nvme_init_thread = spdk_get_thread();

spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
bdev_nvme_destroy_poll_group_cb,
sizeof(struct nvme_poll_group), "nvme_poll_groups");

return 0;
}

static struct spdk_bdev_module nvme_if = {
.name = "nvme",
.async_fini = true,
.module_init = bdev_nvme_library_init,
.module_fini = bdev_nvme_library_fini,
.config_json = bdev_nvme_config_json,
.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

对比标准hello_bdev调用栈

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
(gdb) bt
#0 bdev_nvme_library_init () at bdev_nvme.c:6805
#1 0x00007ffff7d375b8 in bdev_modules_init () at bdev.c:2059
#2 0x00007ffff7d37845 in spdk_bdev_initialize (cb_fn=0x7ffff7d5e1d9 <bdev_initialize_complete>, cb_arg=0x0) at bdev.c:2128
#3 0x00007ffff7d5e212 in bdev_subsystem_initialize () at bdev.c:24
#4 0x00007ffff7c0c3fa in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#5 0x00007ffff7ce11d7 in accel_subsystem_initialize () at accel.c:20
#6 0x00007ffff7c0c3fa in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#7 0x00007ffff7c365d8 in vmd_subsystem_init () at vmd.c:63
#8 0x00007ffff7c0c3fa in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#9 0x00007ffff7c2218b in sock_subsystem_init () at sock.c:13
#10 0x00007ffff7c0c3fa in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#11 0x00007ffff7c143e6 in iobuf_subsystem_initialize () at iobuf.c:22
#12 0x00007ffff7c0c3fa in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#13 0x00007ffff7c0c572 in spdk_subsystem_init (cb_fn=0x7ffff7c0b6ea <subsystem_init_done>, cb_arg=0x555555583700) at subsystem.c:199
#14 0x00007ffff7c0b836 in app_json_config_load_subsystem (_ctx=0x555555583700) at json_config.c:471
#15 0x00007ffff7bfa0d2 in msg_queue_run_batch (thread=0x555555583370, max_msgs=8) at thread.c:841
#16 0x00007ffff7bfaa2b in thread_poll (thread=0x555555583370, max_msgs=0, now=6000572509297613) at thread.c:1063
#17 0x00007ffff7bfad3a in spdk_thread_poll (thread=0x555555583370, max_msgs=0, now=6000572509297613) at thread.c:1156
#18 0x00007ffff7d6e3da in _reactor_run (reactor=0x555555585440) at reactor.c:914
#19 0x00007ffff7d6e4cc in reactor_run (arg=0x555555585440) at reactor.c:952
#20 0x00007ffff7d6e953 in spdk_reactors_start () at reactor.c:1068
#21 0x00007ffff7d6aa79 in spdk_app_start (opts_user=0x7fffffffde50, start_fn=0x555555556be7 <hello_start>, arg1=0x7fffffffddf0) at app.c:827
#22 0x0000555555556fc9 in main (argc=5, argv=0x7fffffffe038) at hello_bdev.c:306

(gdb) p g_subsystems
$2 = {tqh_first = 0x7ffff7c17020 <g_subsystem_iobuf>, tqh_last = 0x7ffff7d61040 <g_spdk_subsystem_bdev+32>}

test程序

1
2
(gdb) p g_subsystems
$2 = {tqh_first = 0x0, tqh_last = 0x7ffff774a0c0 <g_subsystems>}

追踪到spdk_add_subsystem

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
void
spdk_add_subsystem(struct spdk_subsystem *subsystem)
{
TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq);
}

/**
* \brief Register a new subsystem
*/
#define SPDK_SUBSYSTEM_REGISTER(_name) \
__attribute__((constructor)) static void _name ## _register(void) \
{ \
spdk_add_subsystem(&_name); \
}

static struct spdk_subsystem g_spdk_subsystem_bdev = {
.name = "bdev",
.init = bdev_subsystem_initialize,
.fini = bdev_subsystem_finish,
.write_config_json = bdev_subsystem_config_json,
};

SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_bdev);
SPDK_SUBSYSTEM_DEPEND(bdev, accel)
SPDK_SUBSYSTEM_DEPEND(bdev, vmd)
SPDK_SUBSYSTEM_DEPEND(bdev, sock)
SPDK_SUBSYSTEM_DEPEND(bdev, iobuf)

通过对比hello_bdev与test链接动态库的区别

1
2
3
4
5
ldd test |  wc -l
75

ldd hello_bdev | wc -l
82

发现test少链接了一些动态库(例如一些event_xxx)的库

1
2
3
4
5
6
7
8
9
10
11
libspdk_event_bdev.so.5.0 => /root/xxx/spdk/build/lib/libspdk_event_bdev.so.5.0 (0x00007f95a6bd0000)
libspdk_bdev.so.13.0 => /root/xxx/spdk/build/lib/libspdk_bdev.so.13.0 (0x00007f95a6b9d000)
libspdk_notify.so.5.0 => /root/xxx/spdk/build/lib/libspdk_notify.so.5.0 (0x00007f95a6b58000)
libspdk_event_accel.so.5.0 => /root/xxx/spdk/build/lib/libspdk_event_accel.so.5.0 (0x00007f95a6b53000)
libspdk_accel.so.13.0 => /root/xxx/spdk/build/lib/libspdk_accel.so.13.0 (0x00007f95a6ab4000)
libspdk_dma.so.3.0 => /root/xxx/spdk/build/lib/libspdk_dma.so.3.0 (0x00007f95a6aad000)
libspdk_event_vmd.so.5.0 => /root/xxx/spdk/build/lib/libspdk_event_vmd.so.5.0 (0x00007f95a6aa8000)
libspdk_vmd.so.5.0 => /root/xxx/spdk/build/lib/libspdk_vmd.so.5.0 (0x00007f95a6a99000)
libspdk_event_sock.so.4.0 => /root/xxx/spdk/build/lib/libspdk_event_sock.so.4.0 (0x00007f95a6a94000)
libspdk_sock.so.8.0 => /root/xxx/spdk/build/lib/libspdk_sock.so.8.0 (0x00007f95a6a8b000)
libspdk_event_iobuf.so.2.0 => /root/xxx/spdk/build/lib/libspdk_event_iobuf.so.2.0 (0x00007f95a6a86000)

故像spdk-demo中一样,把所有库加上(现在意识到了这种做法的聪明之处,之前我只加了几项,因为我以为其他依赖库会被一并加入,但这样做应该漏了几项)

感觉可以将spdk/build/lib/pkgconfig中所有pc名都加上,以防万一

1
2
3
4
5
6
7
8
9
10
11
ls
spdk_accel_ioat.pc spdk_bdev_modules.pc spdk_blobfs_bdev.pc spdk_event_iobuf.pc spdk_event_vmd.pc spdk_notify.pc spdk_sock.pc spdk_vhost.pc
spdk_accel_modules.pc spdk_bdev_null.pc spdk_blobfs.pc spdk_event_iscsi.pc spdk_ftl.pc spdk_nvme.pc spdk_sock_posix.pc spdk_virtio.pc
spdk_accel.pc spdk_bdev_nvme.pc spdk_blob.pc spdk_event_nbd.pc spdk_init.pc spdk_nvmf.pc spdk_syslibs.pc spdk_vmd.pc
spdk_bdev_aio.pc spdk_bdev_passthru.pc spdk_conf.pc spdk_event_nvmf.pc spdk_ioat.pc spdk_rpc.pc spdk_thread.pc tmp/
spdk_bdev_delay.pc spdk_bdev.pc spdk_dma.pc spdk_event.pc spdk_iscsi.pc spdk_scheduler_dpdk_governor.pc spdk_trace_parser.pc
spdk_bdev_error.pc spdk_bdev_raid.pc spdk_dpdklibs.pc spdk_event_scheduler.pc spdk_json.pc spdk_scheduler_dynamic.pc spdk_trace.pc
spdk_bdev_ftl.pc spdk_bdev_split.pc spdk_env_dpdk.pc spdk_event_scsi.pc spdk_jsonrpc.pc spdk_scheduler_gscheduler.pc spdk_util.pc
spdk_bdev_gpt.pc spdk_bdev_virtio.pc spdk_env_dpdk_rpc.pc spdk_event_sock.pc spdk_log.pc spdk_scheduler_modules.pc spdk_ut_mock.pc
spdk_bdev_lvol.pc spdk_bdev_zone_block.pc spdk_event_accel.pc spdk_event_vhost_blk.pc spdk_lvol.pc spdk_scsi.pc spdk_ut.pc
spdk_bdev_malloc.pc spdk_blob_bdev.pc spdk_event_bdev.pc spdk_event_vhost_scsi.pc spdk_nbd.pc spdk_sock_modules.pc spdk_vfio_user.pc

最终成果

makefile

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
CC := g++
PKG_CONFIG_PATH = /root/xxx/spdk/build/lib/pkgconfig
ALL_SPDK_LIBS := spdk_accel_ioat spdk_blobfs spdk_jsonrpc \
spdk_accel_modules spdk_blob spdk_log \
spdk_accel spdk_conf spdk_lvol \
spdk_bdev_aio spdk_dpdklibs spdk_nbd \
spdk_bdev_delay spdk_env_dpdk \
spdk_bdev_error spdk_env_dpdk_rpc spdk_notify \
spdk_bdev_ftl spdk_event_accel spdk_nvme \
spdk_bdev_gpt spdk_event_bdev spdk_nvmf \
spdk_bdev_lvol spdk_event_iscsi spdk_rpc \
spdk_bdev_malloc spdk_event_nbd spdk_scsi \
spdk_bdev_modules spdk_sock_modules \
spdk_bdev_null spdk_event_nvmf spdk_sock \
spdk_bdev_nvme spdk_event spdk_sock_posix \
spdk_bdev_passthru spdk_event_scsi spdk_syslibs \
spdk_bdev spdk_event_sock spdk_thread \
spdk_bdev_raid spdk_trace \
spdk_bdev_split spdk_event_vmd spdk_util \
spdk_bdev_virtio spdk_ftl \
spdk_bdev_zone_block spdk_ioat spdk_vhost \
spdk_blob_bdev spdk_iscsi spdk_virtio \
spdk_blobfs_bdev spdk_json spdk_vmd \
spdk_thread
SPDK_LIB := $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" pkg-config --cflags --libs $(ALL_SPDK_LIBS))



test:
$(CC) test.c -fpermissive -pthread -g -o test -Wl,--no-as-needed $(SPDK_LIB) -Wl,--as-needed

clean:
rm test

编译输出

1
2
3
4
5
6
7
8
9
10
11
g++ test.c  -fpermissive -pthread -g  -o test  -Wl,--no-as-needed  -I/root/xxx/spdk/build/include -L/root/xxx/spdk/build/lib -L/root/xxx/spdk/dpdk/build/lib -L/root/xxx/spdk/build/lib -lspdk_env_dpdk -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf -lrte_bus_pci -lrte_pci -lrte_mempool_ring -lrte_telemetry -lrte_kvargs -lrte_rcu -lrte_power -lrte_ethdev -lrte_net -lrte_vhost -lrte_net -lrte_dmadev -lrte_cryptodev -lrte_hash -lspdk_event_iscsi -lspdk_event_nbd -lspdk_nbd -lspdk_event_nvmf -lspdk_nvmf -lspdk_event_scheduler -lspdk_event -lspdk_env_dpdk_rpc -lspdk_event_scsi -lspdk_event_bdev -lspdk_event_accel -lspdk_event_iobuf -lspdk_event_sock -lspdk_event_vmd -lspdk_init -lspdk_vhost -lspdk_iscsi -lspdk_conf -lspdk_scsi -lspdk_blobfs_bdev -lspdk_blob_bdev -lspdk_bdev -lspdk_notify -lspdk_bdev_malloc -lspdk_bdev_null -lspdk_bdev_nvme -lspdk_bdev_passthru -lspdk_bdev_lvol -lspdk_bdev_raid -lspdk_accel -lspdk_accel_ioat -lspdk_ioat -lspdk_bdev_error -lspdk_bdev_gpt -lspdk_bdev_split -lspdk_bdev_delay -lspdk_bdev_zone_block -lspdk_lvol -lspdk_nvme -lspdk_sock -lspdk_sock_posix -lspdk_bdev_aio -lspdk_bdev_ftl -lspdk_ftl -lspdk_bdev_virtio -lspdk_virtio -lspdk_vfio_user -lspdk_blobfs -lspdk_blob -lspdk_dma -lspdk_vmd -lspdk_thread -lspdk_trace -lspdk_rpc -lspdk_jsonrpc -lspdk_json -lspdk_util -lspdk_log  -Wl,--as-needed
test.c:15:28: warning: ISO C++ forbids converting a string constant to ‘char*’ [-Wwrite-strings]
15 | static char *g_bdev_name = "Malloc0";
| ^~~~~~~~~
test.c: In function ‘void read_complete(spdk_bdev_io*, bool, void*)’:
test.c:62:42: warning: invalid conversion from ‘void*’ to ‘hello_context_t*’ [-fpermissive]
62 | struct hello_context_t *hello_context = cb_arg;
| ^~~~~~
| |
| void*
# 省略部分输出

运行时输出

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
root@osd-node0 ~/l/s/zns-api-test (v23.09.x)# ./test -c zns.json   -b Nvme0n1
[2023-11-14 15:54:54.808886] Starting SPDK v23.09.1-pre git sha1 aa8059716 / DPDK 23.07.0 initialization...
[2023-11-14 15:54:54.808932] [ DPDK EAL parameters: hello_bdev --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid1673133 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-11-14 15:54:54.815662] app.c: 786:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-11-14 15:54:54.841394] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-11-14 15:54:55.129643] test.c: 222:hello_start: *NOTICE*: Successfully started the application
[2023-11-14 15:54:55.129663] test.c: 231:hello_start: *NOTICE*: Opening the bdev Nvme0n1
[2023-11-14 15:54:55.129668] test.c: 244:hello_start: *NOTICE*: Opening io channel
[2023-11-14 15:54:55.149196] test.c: 138:hello_write: *NOTICE*: Writing to the bdev
[2023-11-14 15:54:55.149228] test.c: 117:write_complete: *NOTICE*: bdev io write completed successfully
[2023-11-14 15:54:55.149231] test.c: 84:hello_read: *NOTICE*: Reading io
[2023-11-14 15:54:55.149246] test.c: 65:read_complete: *NOTICE*: Read string from bdev : Hello World!

[2023-11-14 15:54:55.149250] test.c: 74:read_complete: *NOTICE*: Stopping app

弄了好久终于成功了!!!

番外

Note that SPDK libraries use constructor functions liberally, so you must surround the library list with extra linker options to ensure these functions are not dropped from the resulting application binary.

SPDK Libraries文档中这句话的意思是什么呢? 按理来说主程序使用库头文件中的函数,静态库/动态库为其提供对应函数的定义,也就是说主程序用到哪些函数,库提供这些函数的全部递归实现,完全未涉及到的函数就不需要链接进来,那么为什么需要设置--whole-archive 或者--no-as-needed 链接选项呢?

1
2
3
ld
--whole-archive Include all objects from following archives
--no-as-needed Always set DT_NEEDED for dynamic libraries mentioned on

以module/event/subsystems/bdev/bdev.c文件为例分析原因

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
// module/event/subsystems/bdev/bdev.c
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2016 Intel Corporation.
* All rights reserved.
*/

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/env.h"
#include "spdk/thread.h"

#include "spdk_internal/init.h"
#include "spdk/env.h"

/* Completion callback for spdk_bdev_initialize(); advances the
 * subsystem-init chain with the bdev layer's result code. */
static void
bdev_initialize_complete(void *cb_arg, int rc)
{
spdk_subsystem_init_next(rc);
}

/* Subsystem init hook: asynchronously initialize the bdev layer. */
static void
bdev_subsystem_initialize(void)
{
spdk_bdev_initialize(bdev_initialize_complete, NULL);
}

/* Completion callback for spdk_bdev_finish(); advances teardown. */
static void
bdev_subsystem_finish_done(void *cb_arg)
{
spdk_subsystem_fini_next();
}

/* Subsystem fini hook: asynchronously tear down the bdev layer. */
static void
bdev_subsystem_finish(void)
{
spdk_bdev_finish(bdev_subsystem_finish_done, NULL);
}

/* Emits this subsystem's current configuration as JSON. */
static void
bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
spdk_bdev_subsystem_config_json(w);
}

static struct spdk_subsystem g_spdk_subsystem_bdev = {
.name = "bdev",
.init = bdev_subsystem_initialize,
.fini = bdev_subsystem_finish,
.write_config_json = bdev_subsystem_config_json,
};

/* Everything above is static and never referenced from outside this file:
 * the registration happens via an __attribute__((constructor)) function
 * generated by this macro, which is why the archive must be linked with
 * --whole-archive / --no-as-needed so the linker does not drop it. */
SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_bdev);
SPDK_SUBSYSTEM_DEPEND(bdev, accel)
SPDK_SUBSYSTEM_DEPEND(bdev, vmd)
SPDK_SUBSYSTEM_DEPEND(bdev, sock)
SPDK_SUBSYSTEM_DEPEND(bdev, iobuf)

这些函数只在本文件被引用,故主程序不可能直接/间接引用到它,g_spdk_subsystem_bdev加上了static修饰符,只在本文件可见,故关键点在于SPDK_SUBSYSTEM_REGISTER宏

1
2
3
4
5
6
7
8
/**
* \brief Register a new subsystem
*
* Expands to a constructor function that the loader runs before main()
* (or at dlopen() time); the function adds the subsystem to SPDK's global
* list. Because nothing else references the generated symbol, the linker
* needs --whole-archive / --no-as-needed to keep it in the binary.
*/
#define SPDK_SUBSYSTEM_REGISTER(_name) \
__attribute__((constructor)) static void _name ## _register(void) \
{ \
spdk_add_subsystem(&_name); \
}

由于主程序无法引用该文件的内容,故它的执行与__attribute__((constructor))有关

__attribute__((constructor))用法解析

constructor参数让系统执行main()函数之前调用函数(被__attribute__((constructor))修饰的函数).同理, destructor让系统在main()函数退出或者调用了exit()之后,调用我们的函数.带有这些修饰属性的函数,对于我们初始化一些在程序中使用的数据非常有用.

How exactly does __attribute__((constructor)) work?

So, the way the constructors and destructors work is that the shared object file contains special sections (.ctors and .dtors on ELF) which contain references to the functions marked with the constructor and destructor attributes, respectively. When the library is loaded/unloaded the dynamic loader program (ld.so or somesuch) checks whether such sections exist, and if so, calls the functions referenced therein.

Come to think of it, there is probably some similar magic in the normal static linker so that the same code is run on startup/shutdown regardless if the user chooses static or dynamic linking.

结论:可以看出bdev.c是一个自成一体的文件,包含或不包含都不会对主程序产生影响,编译与链接都不会有问题,但运行时主程序逻辑上预设这些模块应该被导入/加载,故执行出错。

SPDK Libraries

Linking SPDK applications with pkg-config

lib目录内容

module目录内容

通过spdk_event_xxx库导入问题,有点懂了为什么module是instance了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# 以上分析文件为module/event/subsystems/bdev/bdev.c
# module/event/subsystems/bdev/Makefile文件内容
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation.
# All rights reserved.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk

SO_VER := 5
SO_MINOR := 0

C_SRCS = bdev.c
LIBNAME = event_bdev

SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map

include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk

demo编写

zone id误解

zone_id就是zslba,而不是0,1,2,3,这种序号

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
* Get the first logical block of a zone (known as zone_id or zslba)
* for a given offset.
*
* Note: the returned value is an LBA (the zone's start LBA), not an
* ordinal zone index like 0, 1, 2, ...
*
* \param bdev Block device to query.
* \param offset_blocks The offset, in blocks, from the start of the block device.
* \return The zone_id (also known as zslba) for the given offset.
*/
uint64_t
spdk_bdev_get_zone_id(const struct spdk_bdev *bdev, uint64_t offset_blocks)
{
uint64_t zslba;

if (spdk_likely(spdk_u64_is_pow2(bdev->zone_size))) {
/* Power-of-two zone size: round down by masking off the low bits. */
uint64_t zone_mask = bdev->zone_size - 1;
zslba = offset_blocks & ~zone_mask;
} else {
/* integer division */
zslba = (offset_blocks / bdev->zone_size) * bdev->zone_size;
}

return zslba;
}

active zone限制

注意不能超过 active zone限制(同时操作的zone数目)

1
2
3
4
5
6
[2023-11-15 11:43:48.482631] nvme_qpair.c: 255:nvme_io_qpair_print_command: *NOTICE*: IO COMMAND (7d) sqid:1 cid:61 nsid:1
[2023-11-15 11:43:48.482636] nvme_qpair.c: 474:spdk_nvme_print_completion: *NOTICE*: TOO MANY ACTIVE ZONES (01/bd) qid:1 cid:61 cdw0:80792b80 sqhd:00aa p:1 m:0 dnr:0
[2023-11-15 11:43:48.482641] nvme_qpair.c: 255:nvme_io_qpair_print_command: *NOTICE*: IO COMMAND (7d) sqid:1 cid:60 nsid:1
[2023-11-15 11:43:48.482644] nvme_qpair.c: 474:spdk_nvme_print_completion: *NOTICE*: TOO MANY ACTIVE ZONES (01/bd) qid:1 cid:60 cdw0:80797540 sqhd:00aa p:1 m:0 dnr:0
[2023-11-15 11:43:48.482647] nvme_qpair.c: 255:nvme_io_qpair_print_command: *NOTICE*: IO COMMAND (7d) sqid:1 cid:62 nsid:1
[2023-11-15 11:43:48.482650] nvme_qpair.c: 474:spdk_nvme_print_completion: *NOTICE*: TOO MANY ACTIVE ZONES (01/bd) qid:1 cid:62 cdw0:80798440 sqhd:00aa p:1 m:0 dnr:0

测试代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
// rw_test.cc
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2018 Intel Corporation.
* All rights reserved.
*/

#include "spdk/stdinc.h"
#include "spdk/thread.h"
#include "spdk/bdev.h"
#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/bdev_zone.h"

#include <atomic>

// Name of the bdev under test (created by the attach_controller JSON config).
static const char *g_bdev_name = "Nvme0n1";

// Context shared by every async SPDK callback in this reset/append/read test.
struct rwtest_context_t {
struct spdk_bdev *bdev;
struct spdk_bdev_desc *bdev_desc;
struct spdk_io_channel *bdev_io_channel;
char *write_buff;
char *read_buff;
uint32_t buff_size;
char *bdev_name;
struct spdk_bdev_io_wait_entry bdev_io_wait;

std::atomic<int> count; // atomic counter of outstanding I/Os, to avoid races on concurrent updates
};

// Completion callback for each single-block read issued by read_zone().
// Verifies the block against the write pattern, then chains the next read
// until all 4 zones * 0x100 blocks have been checked, and stops the app.
static void read_zone_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) {
struct rwtest_context_t *test_context = static_cast<struct rwtest_context_t *>(cb_arg);
spdk_bdev_free_io(bdev_io);

if (!success) {
SPDK_ERRLOG("bdev io read zone error: %d\n", EIO);
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
return;
}
// Compare the read buffer against the write buffer contents.
int cmp_res = memcmp(test_context->write_buff, test_context->read_buff, test_context->buff_size);
if (cmp_res != 0) {
SPDK_ERRLOG("read zone data error.\n");
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
return;
}
test_context->count.fetch_add(1);
if (test_context->count.load() == 4 * 0x100) { // read-back verification finished; end the test
SPDK_NOTICELOG("read zone complete.\n");
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(0);
return;
}

memset(test_context->read_buff, 0x34, test_context->buff_size);
// Keep reading until the last test LBA has been verified.
uint64_t lba = test_context->count.load() / 0x100 * spdk_bdev_get_zone_size(test_context->bdev) +
test_context->count.load() % 0x100; // map the running counter to (zone index * zone_size) + block offset

int rc = spdk_bdev_read_blocks(test_context->bdev_desc, test_context->bdev_io_channel, test_context->read_buff, lba,
1, read_zone_complete, test_context);
SPDK_NOTICELOG("read lba:0x%lx\n", lba);
if (rc != 0) {
SPDK_ERRLOG("%s error while reading from bdev: %d\n", spdk_strerror(-rc), rc);
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
return;
}
}

// Start the read-back verification phase: reset the per-I/O counter and
// issue the first single-block read (LBA 0); read_zone_complete() chains
// the remaining reads one at a time.
static void read_zone(void *arg) {
int rc = 0;
struct rwtest_context_t *test_context = static_cast<struct rwtest_context_t *>(arg);
test_context->count = 0;
memset(test_context->read_buff, 0x34, test_context->buff_size); // pre-fill with 0x34 so stale data cannot pass the compare
rc = spdk_bdev_read_blocks(test_context->bdev_desc, test_context->bdev_io_channel, test_context->read_buff, 0, 1,
read_zone_complete, test_context);
SPDK_NOTICELOG("read lba:0x%x\n", 0x0);
if (rc == -ENOMEM) {
SPDK_NOTICELOG("Queueing io\n");
/* In case we cannot perform I/O now, queue I/O */
test_context->bdev_io_wait.bdev = test_context->bdev;
test_context->bdev_io_wait.cb_fn = read_zone;
test_context->bdev_io_wait.cb_arg = test_context;
spdk_bdev_queue_io_wait(test_context->bdev, test_context->bdev_io_channel, &test_context->bdev_io_wait);
} else if (rc) {
SPDK_ERRLOG("%s error while reading from bdev: %d\n", spdk_strerror(-rc), rc);
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
}
}

// Completion callback for each zone-append; logs the LBA the device chose
// and, once all appends have completed, starts the read-back phase.
static void write_zone_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) {
struct rwtest_context_t *test_context = static_cast<struct rwtest_context_t *>(cb_arg);
SPDK_NOTICELOG("append lba:0x%lx\n", spdk_bdev_io_get_append_location(bdev_io)); // print where the append actually landed
spdk_bdev_free_io(bdev_io);

if (!success) {
SPDK_ERRLOG("bdev io write zone error: %d\n", EIO);
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
return;
}
test_context->count.fetch_sub(1);
if (test_context->count.load() == 0) { // all appends done; verify the data by reading it back
SPDK_NOTICELOG("write zone complete.\n");
read_zone(test_context);
}
}
// Append the 0x12 pattern to the first 0x100 blocks of each of the first
// 4 zones using zone-append (the device picks the LBA within the zone).
// NOTE(review): all 4 * 0x100 appends are submitted in one burst and
// -ENOMEM is treated as fatal here — unlike reset_zone()/read_zone(),
// there is no io_wait requeue. Writing to only 4 zones also stays within
// this device's max-active-zones limit of 8 (see the log above).
static void write_zone(void *arg) {
int rc = 0;
struct rwtest_context_t *test_context = static_cast<struct rwtest_context_t *>(arg);
uint64_t zone_size = spdk_bdev_get_zone_size(test_context->bdev);
// Write 0x12 into the first 0x100 blocks of each of the 4 zones.
int zone_num = 4;
int append_times = 0x100;
test_context->count = zone_num * append_times;
memset(test_context->write_buff, 0x12, test_context->buff_size);
for (uint64_t slba = 0; slba < zone_num * zone_size; slba += zone_size) {
for (int i = 0; i < append_times; i++) {
rc = spdk_bdev_zone_append(test_context->bdev_desc, test_context->bdev_io_channel, test_context->write_buff, slba,
1, write_zone_complete, test_context);
if (rc != 0) {
SPDK_ERRLOG("%s error while write_zone: %d\n", spdk_strerror(-rc), rc);
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
}
}
}
}
// Completion callback for each zone reset; once all resets have completed,
// kicks off the append (write) phase.
static void reset_zone_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) {
struct rwtest_context_t *test_context = static_cast<struct rwtest_context_t *>(cb_arg);
spdk_bdev_free_io(bdev_io);

if (!success) {
SPDK_ERRLOG("bdev io reset zone error: %d\n", EIO);
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
return;
}
test_context->count.fetch_sub(1);
if (test_context->count.load() == 0) { // all zones reset; start writing data
SPDK_NOTICELOG("reset zone complete.\n");
write_zone(test_context);
}
}

// Reset the write pointers of the first 10 zones; reset_zone_complete()
// starts the write phase when the last reset finishes.
// NOTE(review): if spdk_bdev_zone_management() returns -ENOMEM mid-loop,
// the io_wait callback re-enters reset_zone() itself, which reassigns
// `count` and re-submits resets for zones already submitted — completions
// from the first pass then over-decrement the counter. This pattern only
// works for a single queued I/O (as in hello_bdev); confirm/fix before
// reusing it with a loop.
static void reset_zone(void *arg) {
struct rwtest_context_t *test_context = static_cast<struct rwtest_context_t *>(arg);
int rc = 0;
// Reset the first 10 zones.
int zone_num = 10;
test_context->count = zone_num;
uint64_t zone_size = spdk_bdev_get_zone_size(test_context->bdev);
for (uint64_t slba = 0; slba < zone_num * zone_size; slba += zone_size) {
rc = spdk_bdev_zone_management(test_context->bdev_desc, test_context->bdev_io_channel, slba, SPDK_BDEV_ZONE_RESET,
reset_zone_complete, test_context);
if (rc == -ENOMEM) {
SPDK_NOTICELOG("Queueing io\n");
/* In case we cannot perform I/O now, queue I/O */
test_context->bdev_io_wait.bdev = test_context->bdev;
test_context->bdev_io_wait.cb_fn = reset_zone;
test_context->bdev_io_wait.cb_arg = test_context;
spdk_bdev_queue_io_wait(test_context->bdev, test_context->bdev_io_channel, &test_context->bdev_io_wait);
} else if (rc) {
SPDK_ERRLOG("%s error while resetting zone: %d\n", spdk_strerror(-rc), rc);
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
}
}
}

// Bdev event callback (hot-remove/resize/media-management); this test does
// not handle any event, it only logs the type.
static void test_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx) {
SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
}

// spdk_app_start() entry point: opens the bdev and an I/O channel,
// allocates DMA-capable read/write buffers, checks the device is zoned,
// prints its geometry, and starts the reset -> append -> read pipeline.
static void test_start(void *arg1) {
struct rwtest_context_t *test_context = static_cast<struct rwtest_context_t *>(arg1);
uint32_t buf_align;
int rc = 0;
test_context->bdev = NULL;
test_context->bdev_desc = NULL;

SPDK_NOTICELOG("Successfully started the application\n");
SPDK_NOTICELOG("Opening the bdev %s\n", test_context->bdev_name);
rc = spdk_bdev_open_ext(test_context->bdev_name, true, test_bdev_event_cb, NULL, &test_context->bdev_desc);
if (rc) {
SPDK_ERRLOG("Could not open bdev: %s\n", test_context->bdev_name);
spdk_app_stop(-1);
return;
}
test_context->bdev = spdk_bdev_desc_get_bdev(test_context->bdev_desc);

SPDK_NOTICELOG("Opening io channel\n");
/* Open I/O channel */
test_context->bdev_io_channel = spdk_bdev_get_io_channel(test_context->bdev_desc);
if (test_context->bdev_io_channel == NULL) {
SPDK_ERRLOG("Could not create bdev I/O channel!!\n");
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
return;
}
// Buffer covers one write unit; DMA buffers must honor the bdev's alignment.
test_context->buff_size =
spdk_bdev_get_block_size(test_context->bdev) * spdk_bdev_get_write_unit_size(test_context->bdev);
buf_align = spdk_bdev_get_buf_align(test_context->bdev);
test_context->write_buff = static_cast<char *>(spdk_dma_zmalloc(test_context->buff_size, buf_align, NULL));
if (!test_context->write_buff) {
SPDK_ERRLOG("Failed to allocate buffer\n");
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
return;
}
test_context->read_buff = static_cast<char *>(spdk_dma_zmalloc(test_context->buff_size, buf_align, NULL));
if (!test_context->read_buff) {
SPDK_ERRLOG("Failed to allocate buffer\n");
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
return;
}

if (!spdk_bdev_is_zoned(test_context->bdev)) {
SPDK_ERRLOG("not a ZNS SSD\n");
spdk_put_io_channel(test_context->bdev_io_channel);
spdk_bdev_close(test_context->bdev_desc);
spdk_app_stop(-1);
return;
}
// Print some information about the ZNS SSD.
SPDK_NOTICELOG(
"block size:%d write unit:%d zone size:%lx zone num:%ld max append size:%d max open zone:%d max active "
"zone:%d\n",
spdk_bdev_get_block_size(test_context->bdev), spdk_bdev_get_write_unit_size(test_context->bdev),
spdk_bdev_get_zone_size(test_context->bdev), spdk_bdev_get_num_zones(test_context->bdev),
spdk_bdev_get_max_zone_append_size(test_context->bdev), spdk_bdev_get_max_open_zones(test_context->bdev),
spdk_bdev_get_max_active_zones(test_context->bdev));
reset_zone(test_context);
}

// Program entry point: parses SPDK app options (use -c zns.json to attach
// the NVMe controller), runs the reactor via spdk_app_start(), and cleans
// up the DMA buffers after the app stops.
int main(int argc, char **argv) {
struct spdk_app_opts opts = {};
int rc = 0;
struct rwtest_context_t test_context = {};

spdk_app_opts_init(&opts, sizeof(opts));
opts.name = "test_bdev";

if ((rc = spdk_app_parse_args(argc, argv, &opts, NULL, NULL, NULL, NULL)) != SPDK_APP_PARSE_ARGS_SUCCESS) {
exit(rc);
}
test_context.bdev_name = const_cast<char *>(g_bdev_name);

// Blocks until spdk_app_stop() is called from one of the callbacks above.
rc = spdk_app_start(&opts, test_start, &test_context);
if (rc) {
SPDK_ERRLOG("ERROR starting application\n");
}

// Buffers were allocated inside test_start(); free them before env teardown.
spdk_dma_free(test_context.write_buff);
spdk_dma_free(test_context.read_buff);

spdk_app_fini();
return rc;
}

注意采用上文动态库编译方法(实际上静态库包含所有的pkg-config估计也可以,只是我懒得再试了),traddr为测试设备的地址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
// zns.json
{
"subsystems": [
{
"subsystem": "bdev",
"config": [
{
"method": "bdev_nvme_attach_controller",
"params": {
"trtype": "PCIe",
"name": "Nvme0",
"traddr": "0000:01:00.0"
}
}
]
}
]
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# makefile
CC := g++
PKG_CONFIG_PATH = /root/xxx/spdk/build/lib/pkgconfig
ALL_SPDK_LIBS := spdk_accel_ioat spdk_blobfs spdk_jsonrpc \
spdk_accel_modules spdk_blob spdk_log \
spdk_accel spdk_conf spdk_lvol \
spdk_bdev_aio spdk_dpdklibs spdk_nbd \
spdk_bdev_delay spdk_env_dpdk \
spdk_bdev_error spdk_env_dpdk_rpc spdk_notify \
spdk_bdev_ftl spdk_event_accel spdk_nvme \
spdk_bdev_gpt spdk_event_bdev spdk_nvmf \
spdk_bdev_lvol spdk_event_iscsi spdk_rpc \
spdk_bdev_malloc spdk_event_nbd spdk_scsi \
spdk_bdev_modules spdk_sock_modules \
spdk_bdev_null spdk_event_nvmf spdk_sock \
spdk_bdev_nvme spdk_event spdk_sock_posix \
spdk_bdev_passthru spdk_event_scsi spdk_syslibs \
spdk_bdev spdk_event_sock spdk_thread \
spdk_bdev_raid spdk_trace \
spdk_bdev_split spdk_event_vmd spdk_util \
spdk_bdev_virtio spdk_ftl \
spdk_bdev_zone_block spdk_ioat spdk_vhost \
spdk_blob_bdev spdk_iscsi spdk_virtio \
spdk_blobfs_bdev spdk_json spdk_vmd \
spdk_thread
SPDK_LIB := $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" pkg-config --cflags --libs $(ALL_SPDK_LIBS))



rw_test:rw_test.cc
$(CC) rw_test.cc -pthread -g -o rw_test -Wl,--no-as-needed $(SPDK_LIB) -Wl,--as-needed

clean:
rm rw_test

运行时输出(注意-c 指定json文件名)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
./rw_test -c zns.json
[2023-11-15 20:06:59.192639] Starting SPDK v23.09.1-pre git sha1 aa8059716 / DPDK 23.07.0 initialization...
[2023-11-15 20:06:59.192690] [ DPDK EAL parameters: test_bdev --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid1725836 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-11-15 20:06:59.199605] app.c: 786:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-11-15 20:06:59.225503] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-11-15 20:06:59.528947] rw_test.cc: 196:test_start: *NOTICE*: Successfully started the application
[2023-11-15 20:06:59.528977] rw_test.cc: 197:test_start: *NOTICE*: Opening the bdev Nvme0n1
[2023-11-15 20:06:59.528982] rw_test.cc: 206:test_start: *NOTICE*: Opening io channel
[2023-11-15 20:06:59.529413] rw_test.cc: 243:test_start: *NOTICE*: block size:4096 write unit:1 zone size:100000 zone num:2712 max append size:32 max open zone:8 max active zone:8
[2023-11-15 20:06:59.568018] rw_test.cc: 154:reset_zone_complete: *NOTICE*: reset zone complete.
[2023-11-15 20:06:59.568371] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x0
[2023-11-15 20:06:59.568379] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x1
[2023-11-15 20:06:59.568383] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x2
[2023-11-15 20:06:59.568386] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x3
[2023-11-15 20:06:59.568389] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x4
[2023-11-15 20:06:59.568392] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x5
[2023-11-15 20:06:59.568395] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x6
[2023-11-15 20:06:59.568413] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x7
[2023-11-15 20:06:59.568416] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x8
[2023-11-15 20:06:59.568419] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x9
[2023-11-15 20:06:59.568422] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0xa
[2023-11-15 20:06:59.568426] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0xb
[2023-11-15 20:06:59.568429] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0xc
[2023-11-15 20:06:59.568432] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0xd
[2023-11-15 20:06:59.568436] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0xe
[2023-11-15 20:06:59.568440] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0xf
[2023-11-15 20:06:59.568444] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x10
# 省略部分输出
[2023-11-15 20:06:59.584544] rw_test.cc: 103:write_zone_complete: *NOTICE*: append lba:0x3000ff
[2023-11-15 20:06:59.584547] rw_test.cc: 115:write_zone_complete: *NOTICE*: write zone complete.
[2023-11-15 20:06:59.584555] rw_test.cc: 85:read_zone: *NOTICE*: read lba:0x0
[2023-11-15 20:06:59.584572] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x1
[2023-11-15 20:06:59.584589] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x2
[2023-11-15 20:06:59.584607] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x3
[2023-11-15 20:06:59.584624] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x4
[2023-11-15 20:06:59.584641] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x5
[2023-11-15 20:06:59.584658] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x6
[2023-11-15 20:06:59.584674] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x7
[2023-11-15 20:06:59.584690] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x8
[2023-11-15 20:06:59.584706] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x9
[2023-11-15 20:06:59.584721] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0xa
[2023-11-15 20:06:59.584738] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0xb
[2023-11-15 20:06:59.584754] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0xc
[2023-11-15 20:06:59.584770] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0xd
[2023-11-15 20:06:59.584786] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0xe
[2023-11-15 20:06:59.584804] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0xf
[2023-11-15 20:06:59.584819] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x10
# 省略部分输出
[2023-11-15 20:06:59.605052] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x3000fd
[2023-11-15 20:06:59.605069] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x3000fe
[2023-11-15 20:06:59.605085] rw_test.cc: 68:read_zone_complete: *NOTICE*: read lba:0x3000ff
[2023-11-15 20:06:59.605101] rw_test.cc: 54:read_zone_complete: *NOTICE*: read zone complete.

2 实现env基本功能

目标:利用SPDK使能ZNS SSD,使用SPDK的Bdev接口实现LevelDB Env,但只为了简单模拟测试,并不进行任何持久化,错误处理,并发保护的开发。简而言之,能跑起来就行,有bug也无所谓!

基本思路: 将helpers/memenv/memenv.cc代码照搬过来,只是额外记录数据对应的ZNS LBA,在读文件前将数据从ZNS读取到内存缓冲区,写文件完成后将数据由内存缓冲区写入到ZNS SSD并记录对应LBA。SPDK相关的实现与之前demo类似。

实现方式:同步接口调用SPDK异步的API,传递参数中包含信号量,接口调用API后调用信号量P操作,SPDK回调函数中调用信号量V操作,以此完成两者的同步。数据通路为读者/写者$\Longleftrightarrow$内存缓冲区$\Longleftrightarrow$ZNS SSD,需要注意的是,如果调用异步接口的参数类型为指针类型,那么需保证在异步执行过程中指针指向内存区域不失效(要么指向堆区,要么指向栈区,但待异步执行完后再退出)

编译时leveldb的CMakeLists.txt只需要加上新增文件名,并不需要加上spdk相关的动态库,编写测试demo的时候链接leveldb与spdk相关库,为了省事,直接复制粘贴之前demo make时生成的一长串库名而不是使用pkg-config

问题

Unref中互斥锁卡死

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
(gdb) r
Starting program: /root/xxx/spdk_env/leveldb-spdk-env/test_env/test
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff6a35700 (LWP 57227)]
[2023-11-23 07:02:39.573057] Starting SPDK v23.09.1-pre git sha1 aa8059716 / DPDK 23.07.0 initialization...
[2023-11-23 07:02:39.573113] [ DPDK EAL parameters: test_bdev --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid57223 ]
[New Thread 0x7ffff6234700 (LWP 57228)]
[New Thread 0x7ffff5a33700 (LWP 57229)]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-11-23 07:02:39.580418] app.c: 786:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-11-23 07:02:39.606681] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-11-23 07:02:39.930868] /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/spdk_api.cc: 23:start_fn: *NOTICE*: Successfully started the application
[2023-11-23 07:02:39.930889] /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/spdk_api.cc: 24:start_fn: *NOTICE*: Opening the bdev Nvme0n1
[2023-11-23 07:02:39.930895] /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/spdk_api.cc: 34:start_fn: *NOTICE*: Opening io channel
[2023-11-23 07:02:39.931095] /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/spdk_api.cc: 45:start_fn: *NOTICE*: block size:4096 write unit:1 zone size:100000 zone num:2712 max append size:32 max open zone:8 max active zone:8
[2023-11-23 07:02:39.931103] /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/spdk_api.cc: 57:start_fn: *NOTICE*: spdk init success.
ZNS SPDK Env init complete
^C
Thread 1 "test" received signal SIGINT, Interrupt.
__lll_lock_wait (futex=futex@entry=0x555555600f20, private=0) at lowlevellock.c:52
52 lowlevellock.c: No such file or directory.
(gdb) bt
#0 __lll_lock_wait (futex=futex@entry=0x555555600f20, private=0) at lowlevellock.c:52
#1 0x00007ffff71340a3 in __GI___pthread_mutex_lock (mutex=0x555555600f20) at ../nptl/pthread_mutex_lock.c:80
#2 0x0000555555565809 in __gthread_mutex_lock (__mutex=0x555555600f20) at /usr/include/x86_64-linux-gnu/c++/9/bits/gthr-default.h:749
#3 0x0000555555565cee in std::mutex::lock (this=0x555555600f20) at /usr/include/c++/9/bits/std_mutex.h:100
#4 0x0000555555565d40 in leveldb::port::Mutex::Lock (this=0x555555600f20) at /root/xxx/spdk_env/leveldb-spdk-env/./port/port_stdcxx.h:59
#5 0x0000555555566ffa in leveldb::MutexLock::MutexLock (this=0x7fffffffdac0, mu=0x555555600f20) at /root/xxx/spdk_env/leveldb-spdk-env/./util/mutexlock.h:26
#6 0x00005555555910fe in leveldb::FileState::Unref (this=0x555555600f20) at /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/filesystem.cc:21
#7 0x000055555559247b in leveldb::ZnsSpdkEnv::RemoveFileInternal (this=0x5555555fdd60, fname="./testdb/000001.dbtmp") at /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/filesystem.cc:237
#8 0x0000555555592582 in leveldb::ZnsSpdkEnv::RemoveFile (this=0x5555555fdd60, fname="./testdb/000001.dbtmp") at /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/filesystem.cc:247
#9 0x000055555555ed31 in leveldb::DBImpl::RemoveObsoleteFiles (this=0x5555555f4620) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:287
#10 0x0000555555564fe3 in leveldb::DB::Open (options=..., dbname="./testdb", dbptr=0x7fffffffde30) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:1527
#11 0x000055555555b5b0 in main () at test.cpp:39

解决思路

①查看其他线程是否正在占用互斥锁

1
2
3
4
5
6
(gdb) info thread
Id Target Id Frame
* 1 Thread 0x7ffff6a399c0 (LWP 70027) "test" __lll_lock_wait (futex=futex@entry=0x555555600f20, private=0) at lowlevellock.c:52
2 Thread 0x7ffff6a35700 (LWP 70031) "reactor_0" __tls_get_addr () at ../sysdeps/x86_64/tls_get_addr.S:38
3 Thread 0x7ffff6234700 (LWP 70032) "eal-intr-thread" 0x00007ffff705646e in epoll_wait (epfd=6, events=0x7ffff6232b80, maxevents=1, timeout=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:30
4 Thread 0x7ffff5a33700 (LWP 70033) "telemetry-v2" 0x00007ffff713c4ff in __libc_accept (fd=9, addr=..., len=0x0) at ../sysdeps/unix/sysv/linux/accept.c:26

结果:并未发现占用refs_mutex_的线程

参考博客:GDB多线程调试-发现卡死的线程

②将代码中的port::Mutex MutexLock换成未封装的api std::mutex std::lock_guard

结果:报错仍然发生,不是封装api的问题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// Drop one reference; delete the object only after the lock guard has been
// released (deleting while refs_mutex_ is held would destroy a locked mutex).
void FileState::Unref() {
bool do_delete = false;

{
std::lock_guard<std::mutex> lk(refs_mutex_);
--refs_;
assert(refs_ >= 0);
if (refs_ <= 0) {
do_delete = true;
}
}

if (do_delete) {
delete this;
}
}

③ 怀疑对应数据已被释放

卡死时变量值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
Thread 1 "test" received signal SIGINT, Interrupt.
__lll_lock_wait (futex=futex@entry=0x555555600f20, private=0) at lowlevellock.c:52
52 lowlevellock.c: No such file or directory.
(gdb) bt
#0 __lll_lock_wait (futex=futex@entry=0x555555600f20, private=0) at lowlevellock.c:52
#1 0x00007ffff71340a3 in __GI___pthread_mutex_lock (mutex=0x555555600f20) at ../nptl/pthread_mutex_lock.c:80
#2 0x0000555555565809 in __gthread_mutex_lock (__mutex=0x555555600f20) at /usr/include/x86_64-linux-gnu/c++/9/bits/gthr-default.h:749
#3 0x0000555555565cee in std::mutex::lock (this=0x555555600f20) at /usr/include/c++/9/bits/std_mutex.h:100
#4 0x000055555559397c in std::lock_guard<std::mutex>::lock_guard (this=0x7fffffffdac0, __m=...) at /usr/include/c++/9/bits/std_mutex.h:159
#5 0x00005555555910fe in leveldb::FileState::Unref (this=0x555555600f20) at /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/filesystem.cc:21
#6 0x000055555559247b in leveldb::ZnsSpdkEnv::RemoveFileInternal (this=0x5555555fdd60, fname="./testdb/000001.dbtmp") at /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/filesystem.cc:237
#7 0x0000555555592582 in leveldb::ZnsSpdkEnv::RemoveFile (this=0x5555555fdd60, fname="./testdb/000001.dbtmp") at /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/filesystem.cc:247
#8 0x000055555555ed31 in leveldb::DBImpl::RemoveObsoleteFiles (this=0x5555555f4620) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:287
#9 0x0000555555564fe3 in leveldb::DB::Open (options=..., dbname="./testdb", dbptr=0x7fffffffde30) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:1527
#10 0x000055555555b5b0 in main () at test.cpp:39
(gdb) f 5
#5 0x00005555555910fe in leveldb::FileState::Unref (this=0x555555600f20) at /root/xxx/spdk_env/leveldb-spdk-env/zns_spdk_env/filesystem.cc:21
21 std::lock_guard<std::mutex> lk(refs_mutex_);
(gdb) p *this
$1 = {refs_mutex_ = {<std::__mutex_base> = {_M_mutex = {__data = {__lock = 2, __count = 32767, __owner = -149799840, __nusers = 32767, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\377\177\000\000`<\022\367\377\177", '\000' <repeats 25 times>, __align = 140733193388034}}, <No data fields>}, refs_ = 0, blocks_mutex_ = {<std::__mutex_base> = {_M_mutex = {__data = {__lock = 0,
__count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}}, <No data fields>}, buffer_ = 0x200033a00000 "",
size_ = 0, block_addrs_ = std::vector of length 1, capacity 1 = {{start_block = 93824992725104, num_block = 1432137744}}}
(gdb) p refs_
$2 = 0

引用计数在获取锁时已变成0,_M_mutex数据构成也很奇怪

正常this输出

1
2
3
4
5
(gdb) p *this
$5 = {refs_mutex_ = {<std::__mutex_base> = {_M_mutex = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = '\000' <repeats 39 times>, __align = 0}}, <No data fields>}, refs_ = 2, blocks_mutex_ = {<std::__mutex_base> = {_M_mutex = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0,
__elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}}, <No data fields>}, buffer_ = 0x200033600000 "\225|\271\305\"", size_ = 41,
block_addrs_ = std::vector of length 1, capacity 1 = {{start_block = 292, num_block = 1}}}

故追踪每一个Unref调用,找到对应文件名(./testdb/000001.dbtmp)的上一个Unref,记录出错文件的引用计数变化

gdb时显示变量值的方法

1
2
p ((leveldb::ZnsSpdkEnv*) options.env)->file_map_
p *(leveldb::FileState*)0x555555600f20

问题关键

1
2
3
4
5
 p file_map_
$46 = std::map with 6 elements = {["./testdb/000001.dbtmp"] = 0x555555600f20, ["./testdb/000002.dbtmp"] = 0x555555603180, ["./testdb/000003.log"] = 0x5555555fca40, ["./testdb/CURRENT"] = 0x555555600f20,
["./testdb/MANIFEST-000001"] = 0x555555602bd0, ["./testdb/MANIFEST-000002"] = 0x555555603040}

# ./testdb/000001.dbtmp与./testdb/CURRENT指向同一个文件

错误原因:写代码时误删了file_map_.erase(src)这行,导致GetChildren方法返回了已被析构的文件名,导致调用锁时卡死。所以当线程因为互斥锁卡死时,一种可能是另一个线程持有这把锁没有释放,一种可能是互斥锁对应的空间已被析构,其数据为随机脏数据

ps:时常用git提交已经更改的代码并加上简单注释,避免误删代码debug很久的情况

leveldb单个文件大小

write_buffer_size参数如何影响log文件大小

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  // Amount of data to build up in memory (backed by an unsorted log
// on disk) before converting to a sorted on-disk file.
//
// Larger values increase performance, especially during bulk loads.
// Up to two write buffers may be held in memory at the same time,
// so you may wish to adjust this parameter to control memory usage.
// Also, a larger write buffer will result in a longer recovery time
// the next time the database is opened.
size_t write_buffer_size = 4 * 1024 * 1024;

Status DBImpl::MakeRoomForWrite(bool force) {
mutex_.AssertHeld();
assert(!writers_.empty());
bool allow_delay = !force;
Status s;
while (true) {
if (!bg_error_.ok()) {
// Yield previous error
s = bg_error_;
break;
} else if (allow_delay && versions_->NumLevelFiles(0) >=
config::kL0_SlowdownWritesTrigger) {
// We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several
// seconds when we hit the hard limit, start delaying each
// individual write by 1ms to reduce latency variance. Also,
// this delay hands over some CPU to the compaction thread in
// case it is sharing the same core as the writer.
mutex_.Unlock();
env_->SleepForMicroseconds(1000);
allow_delay = false; // Do not delay a single write more than once
mutex_.Lock();
} else if (!force &&
(mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
// There is room in current memtable
// memtable未达到空间限制,仍可以写入
break;
}
}
}
// May temporarily unlock and wait.
Status status = MakeRoomForWrite(updates == nullptr);
uint64_t last_sequence = versions_->LastSequence();
Writer* last_writer = &w;
if (status.ok() && updates != nullptr) { // nullptr batch is for compactions
WriteBatch* write_batch = BuildBatchGroup(&last_writer);
WriteBatchInternal::SetSequence(write_batch, last_sequence + 1);
last_sequence += WriteBatchInternal::Count(write_batch);

// Add to log and apply to memtable. We can release the lock
// during this phase since &w is currently responsible for logging
// and protects against concurrent loggers and concurrent writes
// into mem_.
{
mutex_.Unlock();
status = log_->AddRecord(WriteBatchInternal::Contents(write_batch));

// 若达到空间限制,在将memtable写入sst文件后删除log,创建新的log文件
// Attempt to switch to a new memtable and trigger compaction of old
assert(versions_->PrevLogNumber() == 0);
uint64_t new_log_number = versions_->NewFileNumber();
WritableFile* lfile = nullptr;
s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
if (!s.ok()) {
// Avoid chewing through file number space in a tight loop.
versions_->ReuseFileNumber(new_log_number);
break;
}

delete log_;

s = logfile_->Close();
if (!s.ok()) {
// We may have lost some data written to the previous log file.
// Switch to the new log file anyway, but record as a background
// error so we do not attempt any more writes.
//
// We could perhaps attempt to save the memtable corresponding
// to log file and suppress the error if that works, but that
// would add more complexity in a critical code path.
RecordBackgroundError(s);
}
delete logfile_;

logfile_ = lfile;
logfile_number_ = new_log_number;
log_ = new log::Writer(lfile);
imm_ = mem_;
has_imm_.store(true, std::memory_order_release);
mem_ = new MemTable(internal_comparator_);
mem_->Ref();
force = false; // Do not force another compaction if have room
MaybeScheduleCompaction();

max_file_size参数如何影响compaction输出文件大小

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
  // Leveldb will write up to this amount of bytes to a file before
// switching to a new one.
// Most clients should leave this parameter alone. However if your
// filesystem is more efficient with larger files, you could
// consider increasing the value. The downside will be longer
// compactions and hence longer latency/performance hiccups.
// Another reason to increase this parameter might be when you are
// initially populating a large database.
size_t max_file_size = 2 * 1024 * 1024;


static size_t TargetFileSize(const Options* options) {
return options->max_file_size;
}

static uint64_t MaxFileSizeForLevel(const Options* options, int level) {
// We could vary per level to reduce number of files?
return TargetFileSize(options);
}

Compaction::Compaction(const Options* options, int level)
: level_(level),
max_output_file_size_(MaxFileSizeForLevel(options, level)),
input_version_(nullptr),
grandparent_index_(0),
seen_key_(false),
overlapped_bytes_(0) {
for (int i = 0; i < config::kNumLevels; i++) {
level_ptrs_[i] = 0;
}
}

uint64_t MaxOutputFileSize() const { return max_output_file_size_; }


// Close output file if it is big enough
if (compact->builder->FileSize() >=
compact->compaction->MaxOutputFileSize()) {
status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) {
break;
}
}

not an sstable (bad magic number)

出错代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
Status Footer::DecodeFrom(Slice* input) {
if (input->size() < kEncodedLength) {
return Status::Corruption("not an sstable (footer too short)");
}

const char* magic_ptr = input->data() + kEncodedLength - 8;
const uint32_t magic_lo = DecodeFixed32(magic_ptr);
const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
(static_cast<uint64_t>(magic_lo)));
if (magic != kTableMagicNumber) {
return Status::Corruption("not an sstable (bad magic number)");
}
// 省略
}

以Corruption函数为断点

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
Thread 5 "test" hit Breakpoint 1, leveldb::Status::Corruption (msg=..., msg2=...) at /root/xxx/spdk_env/leveldb-spdk-env/include/leveldb/status.h:43
43 static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
(gdb) bt
#0 leveldb::Status::Corruption (msg=..., msg2=...) at /root/xxx/spdk_env/leveldb-spdk-env/include/leveldb/status.h:43
#1 0x00005555555a2e62 in leveldb::Footer::DecodeFrom (this=0x7fffefffd4a0, input=0x7fffefffd460) at /root/xxx/spdk_env/leveldb-spdk-env/table/format.cc:54
#2 0x000055555558a03c in leveldb::Table::Open (options=..., file=0x7fffe8000cd0, size=2838930, table=0x7fffefffd558) at /root/xxx/spdk_env/leveldb-spdk-env/table/table.cc:52
#3 0x0000555555577acc in leveldb::TableCache::FindTable (this=0x5555555fcb20, file_number=5, file_size=2838930, handle=0x7fffefffd628) at /root/xxx/spdk_env/leveldb-spdk-env/db/table_cache.cc:60
#4 0x0000555555577c5b in leveldb::TableCache::NewIterator (this=0x5555555fcb20, options=..., file_number=5, file_size=2838930, tableptr=0x0) at /root/xxx/spdk_env/leveldb-spdk-env/db/table_cache.cc:86
#5 0x000055555559ec06 in leveldb::BuildTable (dbname="./testdb", env=0x5555555fdd60, options=..., table_cache=0x5555555fcb20, iter=0x7fffe8000b90, meta=0x7fffefffd7f0)
at /root/xxx/spdk_env/leveldb-spdk-env/db/builder.cc:62
#6 0x0000555555560245 in leveldb::DBImpl::WriteLevel0Table (this=0x5555555f4620, mem=0x555555603240, edit=0x7fffefffd8b0, base=0x5555555fa2e0) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:519
#7 0x0000555555560586 in leveldb::DBImpl::CompactMemTable (this=0x5555555f4620) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:557
#8 0x0000555555560f5b in leveldb::DBImpl::BackgroundCompaction (this=0x5555555f4620) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:706
#9 0x0000555555560ea9 in leveldb::DBImpl::BackgroundCall (this=0x5555555f4620) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:691
#10 0x0000555555560dfe in leveldb::DBImpl::BGWork (db=0x5555555f4620) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:680
#11 0x0000555555599e7e in leveldb::(anonymous namespace)::PosixEnv::BackgroundThreadMain (this=0x5555555ca1a0 <leveldb::Env::Default()::env_container>)
at /root/xxx/spdk_env/leveldb-spdk-env/util/env_posix.cc:850
#12 0x0000555555599af0 in leveldb::(anonymous namespace)::PosixEnv::BackgroundThreadEntryPoint (env=0x5555555ca1a0 <leveldb::Env::Default()::env_container>)
at /root/xxx/spdk_env/leveldb-spdk-env/util/env_posix.cc:751
#13 0x000055555559bc19 in std::__invoke_impl<void, void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> (
__f=@0x5555555e9c10: 0x555555599ad4 <leveldb::(anonymous namespace)::PosixEnv::BackgroundThreadEntryPoint(leveldb::(anonymous namespace)::PosixEnv*)>) at /usr/include/c++/9/bits/invoke.h:60
#14 0x000055555559bb79 in std::__invoke<void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> (
__fn=@0x5555555e9c10: 0x555555599ad4 <leveldb::(anonymous namespace)::PosixEnv::BackgroundThreadEntryPoint(leveldb::(anonymous namespace)::PosixEnv*)>) at /usr/include/c++/9/bits/invoke.h:95
#15 0x000055555559bad9 in std::thread::_Invoker<std::tuple<void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> >::_M_invoke<0, 1> (this=0x5555555e9c08)
at /usr/include/c++/9/thread:244
--Type <RET> for more, q to quit, c to continue without paging--
#16 0x000055555559ba7f in std::thread::_Invoker<std::tuple<void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> >::operator() (this=0x5555555e9c08)
at /usr/include/c++/9/thread:251
#17 0x000055555559ba54 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> > >::_M_run (
this=0x5555555e9c00) at /usr/include/c++/9/thread:195
#18 0x00007ffff723fdf4 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#19 0x00007ffff7131609 in start_thread (arg=<optimized out>) at pthread_create.c:477
#20 0x00007ffff7056133 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
(gdb) f 1
#1 0x00005555555a2e62 in leveldb::Footer::DecodeFrom (this=0x7fffefffd4a0, input=0x7fffefffd460) at /root/xxx/spdk_env/leveldb-spdk-env/table/format.cc:54
warning: Source file is more recent than executable.
54 return Status::Corruption("not an sstable (bad magic number)");
(gdb) p/x magic
$1 = 0x62686878637a6462
(gdb) p/x kTableMagicNumber
$2 = 0xdb4775248b80fb57

思路:文件内容读取出错,首先追踪对应文件的写入过程,查看写入数据是否正常,而后再检查读取过程是否正常。

不过gdb调试时瞅了瞅代码找到了错误:将乘号写成了加号

assert(internal_key.size() >= 8)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
test: /root/xxx/spdk_env/leveldb-spdk-env/./db/dbformat.h:96: leveldb::Slice leveldb::ExtractUserKey(const leveldb::Slice&): Assertion `internal_key.size() >= 8' failed.

Thread 5 "test" received signal SIGABRT, Aborted.
[Switching to Thread 0x7fffeffff700 (LWP 213089)]
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50 ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) bt
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1 0x00007ffff6f59859 in __GI_abort () at abort.c:79
#2 0x00007ffff6f59729 in __assert_fail_base (fmt=0x7ffff70ef588 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x5555555a4380 "internal_key.size() >= 8",
file=0x5555555a4348 "/root/xxx/spdk_env/leveldb-spdk-env/./db/dbformat.h", line=96, function=<optimized out>) at assert.c:92
#3 0x00007ffff6f6afd6 in __GI___assert_fail (assertion=0x5555555a4380 "internal_key.size() >= 8", file=0x5555555a4348 "/root/xxx/spdk_env/leveldb-spdk-env/./db/dbformat.h", line=96,
function=0x5555555a4308 "leveldb::Slice leveldb::ExtractUserKey(const leveldb::Slice&)") at assert.c:101
#4 0x0000555555566024 in leveldb::ExtractUserKey (internal_key=...) at /root/xxx/spdk_env/leveldb-spdk-env/./db/dbformat.h:96
#5 0x00005555555737f8 in leveldb::InternalKeyComparator::Compare (this=0x5555555f58b8, akey=..., bkey=...) at /root/xxx/spdk_env/leveldb-spdk-env/db/dbformat.cc:52
#6 0x0000555555587d5a in leveldb::(anonymous namespace)::MergingIterator::FindSmallest (this=0x7fffe80029f0) at /root/xxx/spdk_env/leveldb-spdk-env/table/merger.cc:155
#7 0x00005555555879b1 in leveldb::(anonymous namespace)::MergingIterator::Next (this=0x7fffe80029f0) at /root/xxx/spdk_env/leveldb-spdk-env/table/merger.cc:78
#8 0x000055555556280a in leveldb::DBImpl::DoCompactionWork (this=0x5555555f4620, compact=0x55559a388600) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:1013
#9 0x00005555555614a5 in leveldb::DBImpl::BackgroundCompaction (this=0x5555555f4620) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:750
#10 0x0000555555560ea9 in leveldb::DBImpl::BackgroundCall (this=0x5555555f4620) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:691
#11 0x0000555555560dfe in leveldb::DBImpl::BGWork (db=0x5555555f4620) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:680
#12 0x0000555555599e7c in leveldb::(anonymous namespace)::PosixEnv::BackgroundThreadMain (this=0x5555555ca1a0 <leveldb::Env::Default()::env_container>)
at /root/xxx/spdk_env/leveldb-spdk-env/util/env_posix.cc:850
#13 0x0000555555599aee in leveldb::(anonymous namespace)::PosixEnv::BackgroundThreadEntryPoint (env=0x5555555ca1a0 <leveldb::Env::Default()::env_container>)
at /root/xxx/spdk_env/leveldb-spdk-env/util/env_posix.cc:751
#14 0x000055555559bc17 in std::__invoke_impl<void, void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> (
__f=@0x5555555e9c10: 0x555555599ad2 <leveldb::(anonymous namespace)::PosixEnv::BackgroundThreadEntryPoint(leveldb::(anonymous namespace)::PosixEnv*)>) at /usr/include/c++/9/bits/invoke.h:60
#15 0x000055555559bb77 in std::__invoke<void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> (
__fn=@0x5555555e9c10: 0x555555599ad2 <leveldb::(anonymous namespace)::PosixEnv::BackgroundThreadEntryPoint(leveldb::(anonymous namespace)::PosixEnv*)>) at /usr/include/c++/9/bits/invoke.h:95
#16 0x000055555559bad7 in std::thread::_Invoker<std::tuple<void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> >::_M_invoke<0, 1> (this=0x5555555e9c08)
--Type <RET> for more, q to quit, c to continue without paging--
at /usr/include/c++/9/thread:244
#17 0x000055555559ba7d in std::thread::_Invoker<std::tuple<void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> >::operator() (this=0x5555555e9c08)
at /usr/include/c++/9/thread:251
#18 0x000055555559ba52 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(leveldb::(anonymous namespace)::PosixEnv*), leveldb::(anonymous namespace)::PosixEnv*> > >::_M_run (
this=0x5555555e9c00) at /usr/include/c++/9/thread:195
#19 0x00007ffff723fdf4 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#20 0x00007ffff7131609 in start_thread (arg=<optimized out>) at pthread_create.c:477
#21 0x00007ffff7056133 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
(gdb) f 3
#3 0x00007ffff6f6afd6 in __GI___assert_fail (assertion=0x5555555a4380 "internal_key.size() >= 8", file=0x5555555a4348 "/root/xxx/spdk_env/leveldb-spdk-env/./db/dbformat.h", line=96,
function=0x5555555a4308 "leveldb::Slice leveldb::ExtractUserKey(const leveldb::Slice&)") at assert.c:101
101 assert.c: No such file or directory.
(gdb) f 4
#4 0x0000555555566024 in leveldb::ExtractUserKey (internal_key=...) at /root/xxx/spdk_env/leveldb-spdk-env/./db/dbformat.h:96
96 assert(internal_key.size() >= 8);
(gdb) p internal_key.size()
$1 = 0
(gdb) f 5
#5 0x00005555555737f8 in leveldb::InternalKeyComparator::Compare (this=0x5555555f58b8, akey=..., bkey=...) at /root/xxx/spdk_env/leveldb-spdk-env/db/dbformat.cc:52
52 int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
(gdb) p akey
$2 = (const leveldb::Slice &) @0x7fffefffd730: {data_ = 0x7fffe8027030 "xlyzwbbnghkunsusoox\001>\257", size_ = 27}
(gdb) p bkey
$3 = (const leveldb::Slice &) @0x7fffefffd740: {data_ = 0x7fffe8003448 "", size_ = 0}
(gdb) f 6
#6 0x0000555555587d5a in leveldb::(anonymous namespace)::MergingIterator::FindSmallest (this=0x7fffe80029f0) at /root/xxx/spdk_env/leveldb-spdk-env/table/merger.cc:155
155 } else if (comparator_->Compare(child->key(), smallest->key()) < 0) {
(gdb) p smallest
$4 = (leveldb::IteratorWrapper *) 0x7fffe802e828
(gdb) p children_
$5 = (leveldb::IteratorWrapper *) 0x7fffe802e828
(gdb) p *children_
$6 = {iter_ = 0x7fffe8002240, valid_ = true, key_ = {data_ = 0x7fffe8003448 "", size_ = 0}}
(gdb) p smallest
$7 = (leveldb::IteratorWrapper *) 0x7fffe802e828
(gdb) p *smallest
$8 = {iter_ = 0x7fffe8002240, valid_ = true, key_ = {data_ = 0x7fffe8003448 "", size_ = 0}}
(gdb) p i
$9 = 1
(gdb) p children_
$10 = (leveldb::IteratorWrapper *) 0x7fffe802e828
(gdb) p children_[0]
$11 = {iter_ = 0x7fffe8002240, valid_ = true, key_ = {data_ = 0x7fffe8003448 "", size_ = 0}}
(gdb) p children_[1]
$12 = {iter_ = 0x7fffe8023310, valid_ = true, key_ = {data_ = 0x7fffe8027030 "xlyzwbbnghkunsusoox\001>\257", size_ = 27}}
(gdb) p children_[2]
$13 = {iter_ = 0x25, valid_ = 232, key_ = {data_ = 0x7fffe80295f0 "", size_ = 124249108909781}}
(gdb) p children_[3]
$14 = {iter_ = 0x25, valid_ = false, key_ = {data_ = 0x7fffe8039380 "\260\351\002\350\377\177", size_ = 32}}
(gdb) p children_[4]
$15 = {iter_ = 0x25, valid_ = 144, key_ = {data_ = 0x3f48 <error: Cannot access memory at address 0x3f48>, size_ = 4294981456}}
(gdb) p n_
$16 = 2
(gdb) p smallest
$17 = (leveldb::IteratorWrapper *) 0x7fffe802e828
(gdb) p *smallest
$18 = {iter_ = 0x7fffe8002240, valid_ = true, key_ = {data_ = 0x7fffe8003448 "", size_ = 0}}
(gdb) p current_
$19 = (leveldb::IteratorWrapper *) 0x7fffe802e828
(gdb) p *current_
$20 = {iter_ = 0x7fffe8002240, valid_ = true, key_ = {data_ = 0x7fffe8003448 "", size_ = 0}}
(gdb) f 8
#8 0x000055555556280a in leveldb::DBImpl::DoCompactionWork (this=0x5555555f4620, compact=0x55559a388600) at /root/xxx/spdk_env/leveldb-spdk-env/db/db_impl.cc:1013
1013 input->Next();
(gdb) p input
$21 = (leveldb::Iterator *) 0x7fffe80029f0
(gdb) p *input
$22 = {_vptr.Iterator = 0x5555555c89f8 <vtable for leveldb::(anonymous namespace)::MergingIterator+16>, cleanup_head_ = {function = 0x0, arg1 = 0x2b5028, arg2 = 0x7fffe8003550, next = 0x0}}
(gdb) p *(MergingIterator*)input
No symbol "MergingIterator" in current context.
(gdb) p *(leveldb::MergingIterator*)input
A syntax error in expression, near `)input'.
(gdb) p compact->compaction
$23 = (leveldb::Compaction * const) 0x7fffe80231b0
(gdb) p *compact->compaction
$24 = {level_ = 1, max_output_file_size_ = 2097152, input_version_ = 0x7fffe8002370, edit_ = {comparator_ = "", log_number_ = 0, prev_log_number_ = 0, next_file_number_ = 0, last_sequence_ = 0,
has_comparator_ = false, has_log_number_ = false, has_prev_log_number_ = false, has_next_file_number_ = false, has_last_sequence_ = false, compact_pointers_ = std::vector of length 1, capacity 1 = {{
first = 1, second = {rep_ = "zzzxdjxeeqwfcfipiriuoztlqehs\001wh\001\000\000\000"}}}, deleted_files_ = std::set with 0 elements, new_files_ = std::vector of length 0, capacity 0}, inputs_ = {
std::vector of length 7, capacity 7 = {0x7fffe8023670, 0x7fffe8023550, 0x7fffe802ee10, 0x7fffe802ea60, 0x7fffe8025e50, 0x7fffe8025f30, 0x7fffe8026010}, std::vector of length 1, capacity 1 = {
0x7fffe8000b90}}, grandparents_ = std::vector of length 0, capacity 0, grandparent_index_ = 0, seen_key_ = true, overlapped_bytes_ = 0, level_ptrs_ = {0, 0, 0, 0, 0, 0, 0}}
(gdb) p compact->compaction->input
$25 = {leveldb::FileMetaData *(const leveldb::Compaction * const, int, int)} 0x555555566f2c <leveldb::Compaction::input(int, int) const>
(gdb) p compact->compaction->input[0]
cannot subscript requested type
(gdb) p compact->compaction->inputs_
$26 = {std::vector of length 7, capacity 7 = {0x7fffe8023670, 0x7fffe8023550, 0x7fffe802ee10, 0x7fffe802ea60, 0x7fffe8025e50, 0x7fffe8025f30, 0x7fffe8026010}, std::vector of length 1, capacity 1 = {
0x7fffe8000b90}}
(gdb) p compact->compaction->inputs_[0]
$27 = std::vector of length 7, capacity 7 = {0x7fffe8023670, 0x7fffe8023550, 0x7fffe802ee10, 0x7fffe802ea60, 0x7fffe8025e50, 0x7fffe8025f30, 0x7fffe8026010}
(gdb) p compact->compaction->inputs_[1]
$28 = std::vector of length 1, capacity 1 = {0x7fffe8000b90}
(gdb) f 6
#6 0x0000555555587d5a in leveldb::(anonymous namespace)::MergingIterator::FindSmallest (this=0x7fffe80029f0) at /root/xxx/spdk_env/leveldb-spdk-env/table/merger.cc:155
155 } else if (comparator_->Compare(child->key(), smallest->key()) < 0) {
(gdb) p child
$29 = (leveldb::IteratorWrapper *) 0x7fffe802e848
(gdb) p *child
$30 = {iter_ = 0x7fffe8023310, valid_ = true, key_ = {data_ = 0x7fffe8027030 "xlyzwbbnghkunsusoox\001>\257", size_ = 27}}
(gdb) p smallest
$31 = (leveldb::IteratorWrapper *) 0x7fffe802e828
(gdb) p *smallest
$32 = {iter_ = 0x7fffe8002240, valid_ = true, key_ = {data_ = 0x7fffe8003448 "", size_ = 0}}
(gdb) p smallest->iter
$33 = {leveldb::Iterator *(const leveldb::IteratorWrapper * const)} 0x55555558bd80 <leveldb::IteratorWrapper::iter() const>
(gdb) p *smallest->iter
Attempt to take contents of a non-pointer value.

转换思路

使用自定义的env后,出现各式各样的问题,通过打印调用栈显现的问题原因也大不一样,一个一个调试起来非常麻烦。不如转换思路:问题的原因一定出在自定义env的实现上,故直接增加测试代码,比对写入前的数据与读取出来的数据是否一致。同时更改实现方式:将文件数据先写入普通的内存缓冲区,而后拷贝到spdk申请的缓冲区并写入ZNS设备;读取前清空spdk内存缓冲区,读取后比对普通缓冲区与spdk缓冲区的数据。特别的是,env只使用普通内存缓冲区中的数据,spdk缓冲区数据只做测试用途,这样就不会影响leveldb的正常运行——即使设备返回错误的数据,文件读取时照样从普通内存缓冲区中读取数据。

spdk app的结束

env的申请与注销

1
2
options.env = leveldb::Env::NewZnsSpdk(leveldb::Env::Default());  // spdk env实现
delete options.env;

析构函数

1
2
3
4
5
6
7
8
ZnsSpdkEnv::~ZnsSpdkEnv() {
for (const auto& kvp : file_map_) {
kvp.second->Unref();
}
SpdkApi::AppStop();
spdk_app_thread_.join();
printf("ZNS SPDK Env destroy complete\n");
}
1
2
3
4
5
6
7
8
9
10
11
12
13
// 释放之前申请的资源并停止app例程
void close_bdev(void* arg) {
SPDK_NOTICELOG("close spdk bdev.\n");
SpdkInfo* spdk_info = static_cast<SpdkInfo*>(arg);
spdk_put_io_channel(spdk_info->bdev_io_channel);
spdk_bdev_close(spdk_info->bdev_desc);
spdk_app_stop(0);
}

// 结束app线程生命周期
void SpdkApi::AppStop() {
spdk_thread_send_msg(spdk_thread_get_app_thread(), close_bdev, &g_spdk_info);
}

特别注意的是,有一些函数只能在特定线程运行,故需借助spdk_thread_send_msg函数在app_thread运行,并创建辅助函数close_bdev完成指针类型转换

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/**
* Release a reference to an I/O channel. This happens asynchronously.
*
* This must be called on the same thread that called spdk_get_io_channel()
* for the specified I/O channel. If this releases the last reference to the
* I/O channel, The destroy_cb function specified in spdk_io_device_register()
* will be invoked to release any associated resources.
*
* \param ch I/O channel to release a reference.
*/
void spdk_put_io_channel(struct spdk_io_channel *ch

/**
* Close a previously opened block device.
*
* Must be called on the same thread that the spdk_bdev_open_ext()
* was performed on.
*
* \param desc Block device descriptor to close.
*/
void spdk_bdev_close(struct spdk_bdev_desc *desc);

注意结束app前需释放之前申请的一系列资源,关闭设备。

奇怪的问题

问题表征

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# 问题1
bdev.c:1027: bdev_io_decrement_outstanding: Assertion `bdev_ch->io_outstanding > 0' failed.

# 问题2
[2023-12-27 15:13:30.040118] nvme_pcie_common.c: 927:nvme_pcie_qpair_process_completions: *ERROR*: cpl does not map to outstanding cmd
[2023-12-27 15:13:30.040142] nvme_qpair.c: 474:spdk_nvme_print_completion: *NOTICE*: SUCCESS (00/00) qid:1 cid:152 cdw0:35a57 sqhd:00f5 p:0 m:0 dnr:0
test: nvme_pcie_common.c:929: nvme_pcie_qpair_process_completions: Assertion `0' failed.

# 问题3
nvme_pcie_common.c:694: nvme_pcie_qpair_complete_tracker: Assertion `cpl->cid == req->cmd.cid' failed.

# 问题4
[2023-12-27 15:24:40.111945] nvme_qpair.c: 255:nvme_io_qpair_print_command: *NOTICE*: IO COMMAND (7d) sqid:1 cid:191 nsid:2
[2023-12-27 15:24:40.111962] nvme_qpair.c: 474:spdk_nvme_print_completion: *NOTICE*: DATA SGL LENGTH INVALID (00/0f) qid:1 cid:191 cdw0:0 sqhd:000c p:1 m:1 dnr:1
append lba:0
[2023-12-27 15:24:40.111969] /root/leveldb-spdk-env/zns_spdk_env/spdk_api.cc: 159:WriteCpl: *ERROR*: bdev io write zone error: 5

这些问题的出错位置都在spdk,阅读自定义env使用spdk接口的代码,也没发现啥问题,所以问题原因比较难找到。

由于问题是随机出现的,感觉是并发导致的问题,故在自定义env操作前后加上锁,保证操作的原子性,不过并没有啥效果,所以直接在spdk库代码中加入一些辅助变量,通过调试spdk找到问题的原因。

Assertion `cpl->cid == req->cmd.cid’ failed. 分析
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
(gdb) p pqpair->tr[157].req
$20 = (struct nvme_request *) 0x200032037300
(gdb) p pqpair->tr[158].req
$21 = (struct nvme_request *) 0x200032037300
(gdb) p pqpair->tr[158]
$22 = {tq_list = {tqe_next = 0x2000070da000, tqe_prev = 0x2000320fe5f0}, req = 0x200032037300, cid = 158, bad_vtophys = 0, rsvd0 = 0, rsvd1 = 0,
cb_fn = 0x7ffff75e3938 <bdev_nvme_zone_appendv_done>, cb_arg = 0x200013a97d98, prp_sgl_bus_addr = 13498167368, meta_sgl = {address = 0, {generic = {
reserved = "\000\000\000\000\000\000", subtype = 0 '\000', type = 0 '\000'}, unkeyed = {length = 0, reserved = "\000\000", subtype = 0 '\000', type = 0 '\000'},
keyed = {length = 0, key = 0, subtype = 0, type = 0}}}, u = {prp = {12132417536, 131072, 0 <repeats 501 times>}, sgl = {{address = 12132417536, {generic = {
reserved = "\000\000\002\000\000\000", subtype = 0 '\000', type = 0 '\000'}, unkeyed = {length = 131072, reserved = "\000\000", subtype = 0 '\000',
type = 0 '\000'}, keyed = {length = 131072, key = 0, subtype = 0, type = 0}}}, {address = 0, {generic = {reserved = "\000\000\000\000\000\000",
subtype = 0 '\000', type = 0 '\000'}, unkeyed = {length = 0, reserved = "\000\000", subtype = 0 '\000', type = 0 '\000'}, keyed = {length = 0, key = 0,
subtype = 0, type = 0}}} <repeats 249 times>}}}
(gdb) p pqpair->tr[157]
$23 = {tq_list = {tqe_next = 0x2000070fb000, tqe_prev = 0x2000070db000}, req = 0x200032037300, cid = 157, bad_vtophys = 0, rsvd0 = 0, rsvd1 = 0,
cb_fn = 0x7ffff75e3938 <bdev_nvme_zone_appendv_done>, cb_arg = 0x200013a97858, prp_sgl_bus_addr = 13498163272, meta_sgl = {address = 0, {generic = {
reserved = "\000\000\000\000\000\000", subtype = 0 '\000', type = 0 '\000'}, unkeyed = {length = 0, reserved = "\000\000", subtype = 0 '\000', type = 0 '\000'},
keyed = {length = 0, key = 0, subtype = 0, type = 0}}}, u = {prp = {12132548608, 131072, 0 <repeats 501 times>}, sgl = {{address = 12132548608, {generic = {
reserved = "\000\000\002\000\000\000", subtype = 0 '\000', type = 0 '\000'}, unkeyed = {length = 131072, reserved = "\000\000", subtype = 0 '\000',
type = 0 '\000'}, keyed = {length = 131072, key = 0, subtype = 0, type = 0}}}, {address = 0, {generic = {reserved = "\000\000\000\000\000\000",
subtype = 0 '\000', type = 0 '\000'}, unkeyed = {length = 0, reserved = "\000\000", subtype = 0 '\000', type = 0 '\000'}, keyed = {length = 0, key = 0,
subtype = 0, type = 0}}} <repeats 249 times>}}}

不同cid对应的tracker req却一致

cid的赋值代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
int
nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
struct nvme_tracker *tr;
int rc = 0;
struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
enum nvme_payload_type payload_type;
bool sgl_supported;
bool mptr_sgl_supported;
bool dword_aligned = true;

if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
}

tr = TAILQ_FIRST(&pqpair->free_tr);

if (tr == NULL) {
pqpair->stat->queued_requests++;
/* Inform the upper layer to try again later. */
rc = -EAGAIN;
goto exit;
}

pqpair->stat->submitted_requests++;
TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
tr->req = req;
tr->cb_fn = req->cb_fn;
tr->cb_arg = req->cb_arg;
req->cmd.cid = tr->cid; // req的cid即为tracker的cid
1
2
3
4
5
6
const struct spdk_nvme_transport_ops pcie_ops = {
.name = "PCIE",

.qpair_submit_request = nvme_pcie_qpair_submit_request,
.qpair_process_completions = nvme_pcie_qpair_process_completions,
};
bdev_ch->io_outstanding > 0 分析

调用栈

spdk_bdev_channel增加成员变量call_inc_times,call_dec_times,进行spdk库的调试

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
struct spdk_bdev_shared_resource *shared_resource)
{
bdev_ch->call_inc_times++;
bdev_ch->io_outstanding++;
shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
struct spdk_bdev_shared_resource *shared_resource)
{
bdev_ch->call_dec_times++;
assert(bdev_ch->io_outstanding > 0);
assert(shared_resource->io_outstanding > 0);
bdev_ch->io_outstanding--;
shared_resource->io_outstanding--;
}
1
2
3
4
5
(gdb) p *bdev_ch
$3 = {bdev = 0x7ffff02422d0, channel = 0x7ffff0242b60, accel_channel = 0x7ffff00ae960, shared_resource = 0x7ffff02418d0, stat = 0x7ffff00053a0, io_outstanding = 0, call_inc_times = 976, call_dec_times = 975, io_submitted = {
tqh_first = 0x200013a99e80, tqh_last = 0x200013a9a1e8}, io_locked = {tqh_first = 0x0, tqh_last = 0x7ffff0242ae0}, io_accel_exec = {tqh_first = 0x0, tqh_last = 0x7ffff0242af0}, io_memory_domain = {tqh_first = 0x0,
tqh_last = 0x7ffff0242b00}, flags = 0, histogram = 0x0, queued_resets = {tqh_first = 0x0, tqh_last = 0x7ffff0242b20}, locked_ranges = {tqh_first = 0x0, tqh_last = 0x7ffff0242b30}, qos_queued_io = {tqh_first = 0x0,
tqh_last = 0x7ffff0242b40}}

发现执行次数没啥大问题

阅读代码,产生疑问,修改bdev_ch->io_outstanding是线程安全的吗?

阅读bdev_io_do_submit函数代码与spdk_bdev_io_complete函数代码,并没有发现锁,原子变量的机制

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
static inline void
bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
{
struct spdk_bdev *bdev = bdev_io->bdev;
struct spdk_io_channel *ch = bdev_ch->channel;
struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
bdev_io_increment_outstanding(bdev_ch, shared_resource); // 增加计数
bdev_io->internal.in_submit_request = true;
bdev_submit_request(bdev, ch, bdev_io);
bdev_io->internal.in_submit_request = false;
} else {
bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT);
if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) {
/* Special case when we have nomem IOs and no outstanding IOs which completions
* could trigger retry of queued IOs */
bdev_shared_ch_retry_io(shared_resource);
}
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
void
spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
{
struct spdk_bdev *bdev = bdev_io->bdev;
struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

bdev_io_decrement_outstanding(bdev_ch, shared_resource); // 减少计数
if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb);
return;
} else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 &&
!bdev_io_use_accel_sequence(bdev_io))) {
_bdev_io_push_bounce_data_buffer(bdev_io,
_bdev_io_complete_push_bounce_done);
/* bdev IO will be completed in the callback */
return;
}
}
bdev_io_complete(bdev_io);
}

推论:要保证对bdev_ch成员(如io_outstanding计数)访问的正确性,提交请求与完成请求这两个函数就必须运行在同一线程中,靠单线程串行执行而非锁来避免竞争

以这个观点回顾之前自定义env的代码,发现我是直接用main线程调用spdk bdev接口,最重要的是,接口使用的是app thread获取的channel而不是本线程的,可能是这个原因导致了这些问题。

channel代码分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/**
* \brief Represents a per-thread channel for accessing an I/O device.
*
* An I/O device may be a physical entity (i.e. NVMe controller) or a software
* entity (i.e. a blobstore).
*
* This structure is not part of the API - all accesses should be done through
* spdk_io_channel function calls.
*/
struct spdk_io_channel {
struct spdk_thread *thread;
struct io_device *dev;
uint32_t ref;
uint32_t destroy_ref;
RB_ENTRY(spdk_io_channel) node;
spdk_io_channel_destroy_cb destroy_cb;

uint8_t _padding[40];
/*
* Modules will allocate extra memory off the end of this structure
* to store references to hardware-specific references (i.e. NVMe queue
* pairs, or references to child device spdk_io_channels (i.e.
* virtual bdevs).
*/
};

SPDK_STATIC_ASSERT(sizeof(struct spdk_io_channel) == SPDK_IO_CHANNEL_STRUCT_SIZE, "incorrect size");

#endif /* SPDK_THREAD_INTERNAL_H_ */

以它的用法可以看出,channel在spdk线程安全上起到很大的作用(per-thread),即保证发送请求的函数与完成请求的函数在同一个spdk_thread上执行,那么这是怎样实现的呢?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* Excerpt from SPDK lib/thread/thread.c: get (or lazily create) the calling
 * thread's channel for an io_device. One channel exists per (thread, device)
 * pair, which is the basis of SPDK's lock-free per-thread I/O model. */
struct spdk_io_channel *
spdk_get_io_channel(void *io_device)
{
	struct spdk_io_channel *ch;
	struct spdk_thread *thread;
	struct io_device *dev;
	int rc;

	pthread_mutex_lock(&g_devlist_mutex);
	dev = io_device_get(io_device); /* look up the registered io_device abstraction */
	if (dev == NULL) {
		SPDK_ERRLOG("could not find io_device %p\n", io_device);
		pthread_mutex_unlock(&g_devlist_mutex);
		return NULL;
	}

	thread = _get_thread(); /* current spdk_thread from the TLS variable tls_thread */
	if (!thread) {
		SPDK_ERRLOG("No thread allocated\n");
		pthread_mutex_unlock(&g_devlist_mutex);
		return NULL;
	}

	ch = thread_get_io_channel(thread, dev); /* search this thread's channel tree */
	if (ch != NULL) { /* channel already exists: bump refcount and return it */
		ch->ref++;
		/* NOTE(review): upstream also unlocks g_devlist_mutex before this
		 * return; the unlock was elided in this excerpt. */
		return ch;
	}
	/* create a new channel; ctx_size extra bytes follow the struct for the module */
	ch = calloc(1, sizeof(*ch) + dev->ctx_size);

	ch->dev = dev;
	ch->destroy_cb = dev->destroy_cb;
	ch->thread = thread;
	ch->ref = 1;
	ch->destroy_ref = 0;
	RB_INSERT(io_channel_tree, &thread->io_channels, ch);

	dev->refcnt++;

	pthread_mutex_unlock(&g_devlist_mutex);
	/* invoke the create_cb registered via spdk_io_device_register();
	 * rc handling is elided in this excerpt */
	rc = dev->create_cb(io_device, (uint8_t *)ch + sizeof(*ch));
	return ch;
}

以上都是一些通用操作,重点看dev->create_cb函数调用

回溯找到对应的io_device注册代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
g_spdk_info.bdev_io_channel = spdk_bdev_get_io_channel(g_spdk_info.bdev_desc);

/* Excerpt: the bdev layer derives its io_device key from the descriptor's bdev. */
struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
}

/* The bdev layer's io_device key is the spdk_bdev pointer plus one byte, so it
 * cannot collide with any pointer value a bdev module registers on its own. */
#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1)


struct spdk_bdev {
/** User context passed in by the backend */
void *ctxt;

/** Unique name for this block device. */
char *name;
}

可以看出，bdev 层自己的 io_device 键是 spdk_bdev 指针加 1（__bdev_to_io_dev 宏），而 NVMe bdev 模块注册的 io_device 则对应 spdk_bdev 结构体的 ctxt 指针（即 nvme_bdev）

找到对应ctxt赋值代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/* Excerpt from SPDK module/bdev/nvme/bdev_nvme.c: fill in the spdk_bdev ("disk")
 * for an NVMe namespace. Most field setup is elided; the point of interest is
 * the ctxt assignment. */
static int
nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
		 uint32_t prchk_flags, void *ctx)
{
	const struct spdk_uuid *uuid;
	const uint8_t *nguid;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_ns_data *nsdata;
	const struct spdk_nvme_ctrlr_opts *opts;
	enum spdk_nvme_csi csi;
	uint32_t atomic_bs, phys_bs, bs;
	char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'};

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	csi = spdk_nvme_ns_get_csi(ns);
	opts = spdk_nvme_ctrlr_get_opts(ctrlr);


	disk->ctxt = ctx; /* ctxt assignment: the caller passes the owning nvme_bdev here */
	disk->fn_table = &nvmelib_fn_table;
	disk->module = &nvme_if;

	return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* Excerpt: create the nvme_bdev for a namespace. Note that the nvme_bdev
 * pointer itself is registered as an io_device here, with
 * bdev_nvme_create_bdev_channel_cb as its channel create_cb. */
static int
nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
{
	struct nvme_bdev *bdev;
	struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr;
	int rc;

	bdev = nvme_bdev_alloc();

	rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr,
			      nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev);

	/* per-thread channels for this bdev will carry a nvme_bdev_channel context */
	spdk_io_device_register(bdev,
				bdev_nvme_create_bdev_channel_cb,
				bdev_nvme_destroy_bdev_channel_cb,
				sizeof(struct nvme_bdev_channel),
				bdev->disk.name);

	nvme_ns->bdev = bdev;
	bdev->nsid = nvme_ns->id;
	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);

	bdev->nbdev_ctrlr = nbdev_ctrlr;
	TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq);

	/* registration triggers bdev_examine() and channel creation (see backtrace below) */
	rc = spdk_bdev_register(&bdev->disk);

	return 0;
}

可以看出create_cb即为bdev_nvme_create_bdev_channel_cb

阅读代码并没有发现什么重要的信息，再次回溯

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* Excerpt: final completion step. The assert enforces that the I/O completes
 * on the same spdk_thread that owns its channel — the rule discussed above. */
static inline void
_bdev_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	assert(bdev_io->internal.cb != NULL);
	assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));

	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
			     bdev_io->internal.caller_ctx);
}


/* Excerpt: the owning thread of a bdev_io is defined as the thread that owns
 * the channel the I/O was submitted on. */
struct spdk_thread *
spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
{
	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
}

可以看出，确实存在"发送 bdev_io 与完成 bdev_io 必须在同一个 spdk_thread 上"的限制（不过不清楚为什么之前有些请求照样能运行而没有触发断言，可能是 assert 只在 debug 构建下生效）

对照调用栈寻找原因

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
(gdb) bt
#0 _bdev_io_complete (ctx=0x200013a964c0) at bdev.c:7136
#1 0x00007ffff79277bc in bdev_io_complete (ctx=0x200013a964c0) at bdev.c:7171
#2 0x00007ffff7927d2b in spdk_bdev_io_complete (bdev_io=0x200013a964c0, status=SPDK_BDEV_IO_STATUS_SUCCESS) at bdev.c:7299
#3 0x00007ffff7928133 in spdk_bdev_io_complete_nvme_status (bdev_io=0x200013a964c0, cdw0=0, sct=0, sc=0) at bdev.c:7408
#4 0x00007ffff75d4050 in __bdev_nvme_io_complete (bdev_io=0x200013a964c0, status=SPDK_BDEV_IO_STATUS_PENDING, cpl=0x20003200a160) at bdev_nvme.c:779
#5 0x00007ffff75d5484 in bdev_nvme_io_complete_nvme_status (bio=0x200013a96898, cpl=0x20003200a160) at bdev_nvme.c:1391
#6 0x00007ffff75e3850 in bdev_nvme_readv_done (ref=0x200013a96898, cpl=0x20003200a160) at bdev_nvme.c:7094
#7 0x00007ffff749d35d in nvme_complete_request (cb_fn=0x7ffff75e36a4 <bdev_nvme_readv_done>, cb_arg=0x200013a96898, qpair=0x2000320fe620, req=0x200032036e00, cpl=0x20003200a160) at /root/spdk/lib/nvme/nvme_internal.h:1412
--Type <RET> for more, q to quit, c to continue without paging--
#8 0x00007ffff749f574 in nvme_pcie_qpair_complete_tracker (qpair=0x2000320fe620, tr=0x2000070d7000, cpl=0x20003200a160, print_on_error=true) at nvme_pcie_common.c:706
#9 0x00007ffff749fcc4 in nvme_pcie_qpair_process_completions (qpair=0x2000320fe620, max_completions=64) at nvme_pcie_common.c:925
#10 0x00007ffff74ad888 in nvme_transport_qpair_process_completions (qpair=0x2000320fe620, max_completions=0) at nvme_transport.c:610
#11 0x00007ffff74a6b52 in spdk_nvme_qpair_process_completions (qpair=0x2000320fe620, max_completions=0) at nvme_qpair.c:791
#12 0x00007ffff74a1ae3 in nvme_pcie_poll_group_process_completions (tgroup=0x7ffff00ae8d0, completions_per_qpair=0, disconnected_qpair_cb=0x7ffff75d58b4 <bdev_nvme_disconnected_qpair_cb>) at nvme_pcie_common.c:1763
#13 0x00007ffff74add37 in nvme_transport_poll_group_process_completions (tgroup=0x7ffff00ae8d0, completions_per_qpair=0, disconnected_qpair_cb=0x7ffff75d58b4 <bdev_nvme_disconnected_qpair_cb>) at nvme_transport.c:714
#14 0x00007ffff74c1599 in spdk_nvme_poll_group_process_completions (group=0x7ffff00977c0, completions_per_qpair=0, disconnected_qpair_cb=0x7ffff75d58b4 <bdev_nvme_disconnected_qpair_cb>) at nvme_poll_group.c:157
#15 0x00007ffff75d5b99 in bdev_nvme_poll (arg=0x7ffff0008c50) at bdev_nvme.c:1616
--Type <RET> for more, q to quit, c to continue without paging--
#16 0x00007ffff7379748 in thread_execute_poller (thread=0x7ffff00093e0, poller=0x7ffff0097820) at thread.c:953
#17 0x00007ffff7379cff in thread_poll (thread=0x7ffff00093e0, max_msgs=0, now=1635704191540816) at thread.c:1079
#18 0x00007ffff7379fb7 in spdk_thread_poll (thread=0x7ffff00093e0, max_msgs=0, now=1635704191540816) at thread.c:1163
#19 0x00007ffff79ed97f in _reactor_run (reactor=0x7ffff0008e00) at reactor.c:914
#20 0x00007ffff79eda77 in reactor_run (arg=0x7ffff0008e00) at reactor.c:952
#21 0x00007ffff79edf25 in spdk_reactors_start () at reactor.c:1068
#22 0x00007ffff79e9d7c in spdk_app_start (opts_user=0x7ffff65fd9f0, start_fn=0x5555555a3504 <leveldb::start_fn(void*)>, arg1=0x5555555fce10) at app.c:839
#23 0x00005555555a391e in leveldb::AppStart (context=0x5555555fce10) at /root/leveldb-spdk-env/zns_spdk_env/spdk_api.cc:83
--Type <RET> for more, q to quit, c to continue without paging--
#24 0x0000555555593023 in operator() (__closure=0x5555555ca9a8) at /root/leveldb-spdk-env/zns_spdk_env/filesystem.cc:194
#25 0x000055555559419c in std::__invoke_impl<void, leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> >(std::__invoke_other, struct {...} &&) (__f=...) at /usr/include/c++/11/bits/invoke.h:61
#26 0x0000555555594151 in std::__invoke<leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> >(struct {...} &&) (__fn=...) at /usr/include/c++/11/bits/invoke.h:96
#27 0x00005555555940fe in std::thread::_Invoker<std::tuple<leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> > >::_M_invoke<0>(std::_Index_tuple<0>) (this=0x5555555ca9a8) at /usr/include/c++/11/bits/std_thread.h:259
#28 0x00005555555940d2 in std::thread::_Invoker<std::tuple<leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> > >::operator()(void) (this=0x5555555ca9a8) at /usr/include/c++/11/bits/std_thread.h:266
#29 0x00005555555940b6 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> > > >::_M_run(void) (this=0x5555555ca9a0) at /usr/include/c++/11/bits/std_thread.h:211
#30 0x00007ffff70dc253 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#31 0x00007ffff6c94ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
--Type <RET> for more, q to quit, c to continue without paging--
#32 0x00007ffff6d26660 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

通过 bdev_nvme_poll 轮询函数回溯，找到注册 poller 的过程

1
2
3
4
5
6
7
8
9
10
11
12
/* Excerpt: per-thread poll group creation. Each spdk_thread that opens a
 * channel on g_nvme_bdev_ctrlrs gets its own NVMe poll group plus a poller
 * (bdev_nvme_poll) that drains completions on that same thread. */
static int
bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
{
	struct nvme_poll_group *group = ctx_buf;

	TAILQ_INIT(&group->qpair_list);

	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);

	return 0;
}
1
2
3
4
5
6
7
8
9
10
11
/* Excerpt: module init registers g_nvme_bdev_ctrlrs as an io_device whose
 * per-thread channel context is a nvme_poll_group. */
static int
bdev_nvme_library_init(void)
{
	g_bdev_nvme_init_thread = spdk_get_thread();

	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
				bdev_nvme_destroy_poll_group_cb,
				sizeof(struct nvme_poll_group), "nvme_poll_groups");

	return 0;
}

通过在bdev_nvme_create_poll_group_cb函数打断点,找到调用栈

冗长的调用栈

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
(gdb) bt
#0 bdev_nvme_create_poll_group_cb (io_device=0x7ffff75ff1e0 <g_nvme_bdev_ctrlrs>, ctx_buf=0x7ffff00ae9e0) at bdev_nvme.c:3383
#1 0x00007ffff737cea1 in spdk_get_io_channel (io_device=0x7ffff75ff1e0 <g_nvme_bdev_ctrlrs>) at thread.c:2353
#2 0x00007ffff75d91da in nvme_qpair_create (nvme_ctrlr=0x7ffff00957c0, ctrlr_ch=0x7ffff00ae900) at bdev_nvme.c:3181
#3 0x00007ffff75d9359 in bdev_nvme_create_ctrlr_channel_cb (io_device=0x7ffff00957c0, ctx_buf=0x7ffff00ae900) at bdev_nvme.c:3235
#4 0x00007ffff737cea1 in spdk_get_io_channel (io_device=0x7ffff00957c0) at thread.c:2353
#5 0x00007ffff75d3ab2 in _bdev_nvme_add_io_path (nbdev_ch=0x7ffff00ae810, nvme_ns=0x7ffff0096f80) at bdev_nvme.c:651
#6 0x00007ffff75d3f46 in bdev_nvme_create_bdev_channel_cb (io_device=0x7ffff0097000, ctx_buf=0x7ffff00ae810) at bdev_nvme.c:756
#7 0x00007ffff737cea1 in spdk_get_io_channel (io_device=0x7ffff0097000) at thread.c:2353
#8 0x00007ffff75d99f1 in bdev_nvme_get_io_channel (ctx=0x7ffff0097000) at bdev_nvme.c:3425
#9 0x00007ffff79202cc in bdev_channel_create (io_device=0x7ffff0097001, ctx_buf=0x7ffff00ae6e0) at bdev.c:4024
#10 0x00007ffff737cea1 in spdk_get_io_channel (io_device=0x7ffff0097001) at thread.c:2353
#11 0x00007ffff7921d5a in spdk_bdev_get_io_channel (desc=0x7ffff0097b50) at bdev.c:4643
#12 0x00007ffff79442dd in bdev_blob_create_channel (dev=0x7ffff0097a70) at blob_bdev.c:353
#13 0x00007ffff73aabf9 in bs_channel_create (io_device=0x7ffff00980c0, ctx_buf=0x7ffff0099600) at blobstore.c:3331
#14 0x00007ffff737cea1 in spdk_get_io_channel (io_device=0x7ffff00980c0) at thread.c:2353
#15 0x00007ffff73b18a8 in bs_register_md_thread (bs=0x7ffff00980c0) at blobstore.c:5805
#16 0x00007ffff73aba72 in bs_alloc (dev=0x7ffff0097a70, opts=0x7ffff65fcf90, _bs=0x7ffff65fcf50, _ctx=0x7ffff65fcf58) at blobstore.c:3684
#17 0x00007ffff73aea8c in spdk_bs_load (dev=0x7ffff0097a70, o=0x7ffff65fd040, cb_fn=0x7ffff74e3a35 <lvs_load_cb>, cb_arg=0x7ffff0097f60) at blobstore.c:4797
#18 0x00007ffff74e3e73 in lvs_load (bs_dev=0x7ffff0097a70, _lvs_opts=0x7ffff65fd160, cb_fn=0x7ffff788ece8 <_vbdev_lvs_examine_cb>, cb_arg=0x7ffff0097a30) at lvol.c:474
#19 0x00007ffff74e3ef1 in spdk_lvs_load_ext (bs_dev=0x7ffff0097a70, opts=0x7ffff65fd160, cb_fn=0x7ffff788ece8 <_vbdev_lvs_examine_cb>, cb_arg=0x7ffff0097a30) at lvol.c:487
#20 0x00007ffff788f21d in vbdev_lvs_load (bs_dev=0x7ffff0097a70, cb_fn=0x7ffff788ece8 <_vbdev_lvs_examine_cb>, cb_arg=0x7ffff0097a30) at vbdev_lvol.c:1719
#21 0x00007ffff788f18c in _vbdev_lvs_examine (bdev=0x7ffff0097000, ori_req=0x7ffff0097790, action=0x7ffff788f1c7 <vbdev_lvs_load>) at vbdev_lvol.c:1700
#22 0x00007ffff788f33f in vbdev_lvs_examine_disk (bdev=0x7ffff0097000) at vbdev_lvol.c:1744
#23 0x00007ffff7917e09 in bdev_examine (bdev=0x7ffff0097000) at bdev.c:716
#24 0x00007ffff792a940 in spdk_bdev_register (bdev=0x7ffff0097000) at bdev.c:8378
#25 0x00007ffff75db8e6 in nvme_bdev_create (nvme_ctrlr=0x7ffff00957c0, nvme_ns=0x7ffff0096f80) at bdev_nvme.c:4180
#26 0x00007ffff75dc56b in nvme_ctrlr_populate_namespace (nvme_ctrlr=0x7ffff00957c0, nvme_ns=0x7ffff0096f80) at bdev_nvme.c:4480
#27 0x00007ffff75dcac1 in nvme_ctrlr_populate_namespaces (nvme_ctrlr=0x7ffff00957c0, ctx=0x7ffff0086c00) at bdev_nvme.c:4639
#28 0x00007ffff75ddb43 in nvme_ctrlr_create_done (nvme_ctrlr=0x7ffff00957c0, ctx=0x7ffff0086c00) at bdev_nvme.c:5115
#29 0x00007ffff75de586 in nvme_ctrlr_create (ctrlr=0x20000b21fa00, name=0x7ffff000d480 "Nvme0", trid=0x7ffff0086c28, ctx=0x7ffff0086c00) at bdev_nvme.c:5357
#30 0x00007ffff75df676 in connect_attach_cb (cb_ctx=0x7ffff0087e70, trid=0x20000b21fa28, ctrlr=0x20000b21fa00, opts=0x20000b221008) at bdev_nvme.c:5805
#31 0x00007ffff74a9510 in nvme_ctrlr_poll_internal (ctrlr=0x20000b21fa00, probe_ctx=0x7ffff00881d0) at nvme.c:743
#32 0x00007ffff74ab7bc in spdk_nvme_probe_poll_async (probe_ctx=0x7ffff00881d0) at nvme.c:1516
#33 0x00007ffff75df763 in bdev_nvme_async_poll (arg=0x7ffff0086c00) at bdev_nvme.c:5842
#34 0x00007ffff7379a5f in thread_execute_timed_poller (thread=0x7ffff00093e0, poller=0x7ffff00955f0, now=1638652381357701) at thread.c:1014
--Type <RET> for more, q to quit, c to continue without paging--
#35 0x00007ffff7379d8a in thread_poll (thread=0x7ffff00093e0, max_msgs=0, now=1638652381357701) at thread.c:1104
#36 0x00007ffff7379fb7 in spdk_thread_poll (thread=0x7ffff00093e0, max_msgs=0, now=1638652381357701) at thread.c:1163
#37 0x00007ffff79ed97f in _reactor_run (reactor=0x7ffff0008e00) at reactor.c:914
#38 0x00007ffff79eda77 in reactor_run (arg=0x7ffff0008e00) at reactor.c:952
#39 0x00007ffff79edf25 in spdk_reactors_start () at reactor.c:1068
#40 0x00007ffff79e9d7c in spdk_app_start (opts_user=0x7ffff65fd9f0, start_fn=0x5555555a3504 <leveldb::start_fn(void*)>, arg1=0x5555555fce10) at app.c:839
#41 0x00005555555a391e in leveldb::AppStart (context=0x5555555fce10) at /root/leveldb-spdk-env/zns_spdk_env/spdk_api.cc:83
#42 0x0000555555593023 in operator() (__closure=0x5555555ca9a8) at /root/leveldb-spdk-env/zns_spdk_env/filesystem.cc:194
#43 0x000055555559419c in std::__invoke_impl<void, leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> >(std::__invoke_other, struct {...} &&) (__f=...) at /usr/include/c++/11/bits/invoke.h:61
#44 0x0000555555594151 in std::__invoke<leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> >(struct {...} &&) (__fn=...) at /usr/include/c++/11/bits/invoke.h:96
#45 0x00005555555940fe in std::thread::_Invoker<std::tuple<leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> > >::_M_invoke<0>(std::_Index_tuple<0>) (this=0x5555555ca9a8) at /usr/include/c++/11/bits/std_thread.h:259
#46 0x00005555555940d2 in std::thread::_Invoker<std::tuple<leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> > >::operator()(void) (this=0x5555555ca9a8) at /usr/include/c++/11/bits/std_thread.h:266
#47 0x00005555555940b6 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<leveldb::ZnsSpdkEnv::ZnsSpdkEnv(leveldb::Env*)::<lambda()> > > >::_M_run(void) (this=0x5555555ca9a0) at /usr/include/c++/11/bits/std_thread.h:211
#48 0x00007ffff70dc253 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#49 0x00007ffff6c94ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#50 0x00007ffff6d26660 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

只看前面部分的调用栈

根据调用栈找到如下代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/* Excerpt: create_cb for a nvme_bdev channel — builds one io_path per
 * namespace of the bdev for the calling thread. */
static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	/* copy the multipath policy settings into the per-thread channel */
	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* Excerpt: wire one io_path into the channel. Getting the controller's
 * channel here is what ultimately creates this thread's qpair. */
static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();


	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr); /* call spdk_get_io_channel; create_cb is bdev_nvme_create_ctrlr_channel_cb */

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair; /* qpair created in nvme_qpair_create() for this thread */
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

spdk_get_io_channel的create_cb为bdev_nvme_create_ctrlr_channel_cb函数

1
2
3
4
5
6
7
8
9
10
/* Excerpt: create_cb for a controller channel — each thread's controller
 * channel owns a freshly created qpair. */
static int
bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;
	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;

	TAILQ_INIT(&ctrlr_ch->pending_resets);

	return nvme_qpair_create(nvme_ctrlr, ctrlr_ch);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/* Excerpt: allocate a qpair for this thread's controller channel and attach
 * it to this thread's poll group, so submissions and completions for the
 * qpair always happen on the same thread. */
static int
nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch)
{
	struct nvme_qpair *nvme_qpair;
	struct spdk_io_channel *pg_ch;
	int rc;

	nvme_qpair = calloc(1, sizeof(*nvme_qpair));

	TAILQ_INIT(&nvme_qpair->io_path_list);

	nvme_qpair->ctrlr = nvme_ctrlr;
	nvme_qpair->ctrlr_ch = ctrlr_ch;

	/* calling spdk_get_io_channel here triggers bdev_nvme_create_poll_group_cb
	 * the first time this thread touches g_nvme_bdev_ctrlrs */
	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);

	nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch);


	if (!nvme_ctrlr->disabled) {
		/* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will
		 * be created when it's enabled.
		 */
		rc = bdev_nvme_create_qpair(nvme_qpair); /* create the qpair and connect it to the poll group */
	}

	TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq);

	ctrlr_ch->qpair = nvme_qpair;

	pthread_mutex_lock(&nvme_qpair->ctrlr->mutex);
	nvme_qpair->ctrlr->ref++;
	pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex);

	return 0;
}

根据以上函数可以看出,每个channel都有对应的poll group,并且有对应的qpair,所以不同thread的收发命令互不干扰

回看最开始的poll函数

1
2
3
4
5
6
7
8
9
/* Excerpt: the per-thread poller body — drains NVMe completions for every
 * qpair in this thread's poll group. */
static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;
	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

相关博客:spdk-20.10 io_channel 和 轮询 group的机制分析

spdk_thread分析

对照博客:[转]spdk线程模型 spdk_thread简单看看reactor thread的相关代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/* Excerpt from SPDK lib/event/reactor.c: launch one reactor per configured
 * core; the calling (current) core runs its reactor inline. */
void
spdk_reactors_start(void)
{
	struct spdk_reactor *reactor;
	uint32_t i, current_core;
	int rc;

	g_rusage_period = (CONTEXT_SWITCH_MONITOR_PERIOD * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC;
	g_reactor_state = SPDK_REACTOR_STATE_RUNNING;
	/* Reinitialize to false, in case the app framework is restarting in the same process. */
	g_stopping_reactors = false;

	current_core = spdk_env_get_current_core();
	SPDK_ENV_FOREACH_CORE(i) {
		if (i != current_core) {
			reactor = spdk_reactor_get(i);
			if (reactor == NULL) {
				continue;
			}

			/* pin reactor_run to the remote core */
			rc = spdk_env_thread_launch_pinned(reactor->lcore, reactor_run, reactor);
		}
		spdk_cpuset_set_cpu(&g_reactor_core_mask, i, true);
	}

	/* Start the main reactor */
	reactor = spdk_reactor_get(current_core);
	reactor_run(reactor); /* the main loop this note focuses on */

	spdk_env_thread_wait_all();

	g_reactor_state = SPDK_REACTOR_STATE_SHUTDOWN;
}

每个reactor都运行reactor_run函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// Only the frequently-executed path is shown; spdk_unlikely branches and
// shutdown handling are elided from this excerpt.
static int
reactor_run(void *arg)
{
	struct spdk_reactor *reactor = arg;
	struct spdk_thread *thread;
	struct spdk_lw_thread *lw_thread, *tmp;
	char thread_name[32];
	uint64_t last_sched = 0;

	while (1) {
		_reactor_run(reactor); /* one iteration of the real work */
		if (g_reactor_state != SPDK_REACTOR_STATE_RUNNING) {
			break;
		}
	}

	TAILQ_FOREACH(lw_thread, &reactor->threads, link) {
		thread = spdk_thread_get_from_ctx(lw_thread);
		/* All threads should have already had spdk_thread_exit() called on them, except
		 * for the app thread.
		 */
		if (spdk_thread_is_running(thread)) {
			spdk_set_thread(thread);
			spdk_thread_exit(thread);
		}
	}
	return 0;
}

/* Excerpt: a reactor owns the lightweight threads scheduled on its core
 * plus an event ring other cores post to. */
struct spdk_reactor {
	/* Lightweight threads running on this reactor */
	TAILQ_HEAD(, spdk_lw_thread) threads;
	uint32_t thread_count;

	/* Logical core number for this reactor. */
	uint32_t lcore;

	struct spdk_ring *events; /* cross-core event queue, drained by event_queue_run_batch() */

} __attribute__((aligned(SPDK_CACHE_LINE_SIZE)));

/* Excerpt: one reactor iteration — drain the event ring, then poll every
 * spdk_thread hosted on this reactor. */
static void
_reactor_run(struct spdk_reactor *reactor)
{
	struct spdk_thread *thread;
	struct spdk_lw_thread *lw_thread, *tmp;
	uint64_t now;
	int rc;

	event_queue_run_batch(reactor); /* handle events posted to this reactor */

	TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) {
		thread = spdk_thread_get_from_ctx(lw_thread);
		rc = spdk_thread_poll(thread, 0, reactor->tsc_last); /* run each thread's messages and pollers */
		reactor_post_process_lw_thread(reactor, lw_thread);
	}
}

/* Excerpt: poll one spdk_thread. Note how tls_thread is swapped so code run
 * inside thread_poll() sees the polled thread as "current". */
int
spdk_thread_poll(struct spdk_thread *thread, uint32_t max_msgs, uint64_t now)
{
	struct spdk_thread *orig_thread;
	int rc;

	orig_thread = _get_thread();
	tls_thread = thread; /* make `thread` the current thread for the duration of the poll */

	rc = thread_poll(thread, max_msgs, now); /* the actual message/poller processing */

	thread_update_stats(thread, spdk_get_ticks(), now, rc);

	tls_thread = orig_thread;

	return rc;
}

/* Excerpt: the work containers of an spdk_thread — active, timed and paused
 * pollers plus the incoming message ring. */
struct spdk_thread {
	/*
	 * Contains pollers actively running on this thread. Pollers
	 * are run round-robin. The thread takes one poller from the head
	 * of the ring, executes it, then puts it back at the tail of
	 * the ring.
	 */
	TAILQ_HEAD(active_pollers_head, spdk_poller) active_pollers;
	/**
	 * Contains pollers running on this thread with a periodic timer.
	 */
	RB_HEAD(timed_pollers_tree, spdk_poller) timed_pollers;
	struct spdk_poller *first_timed_poller; /* cache of the earliest-expiring timed poller */
	/*
	 * Contains paused pollers. Pollers on this queue are waiting until
	 * they are resumed (in which case they're put onto the active/timer
	 * queues) or unregistered.
	 */
	TAILQ_HEAD(paused_pollers_head, spdk_poller) paused_pollers;
	struct spdk_ring *messages; /* ring of messages sent to this thread (e.g. via spdk_thread_send_msg) */
};

/* Excerpt: the core of one thread poll — messages first, then active pollers,
 * then any timed pollers that are due. Returns >0 when work was done. */
static int
thread_poll(struct spdk_thread *thread, uint32_t max_msgs, uint64_t now)
{
	uint32_t msg_count;
	struct spdk_poller *poller, *tmp;
	spdk_msg_fn critical_msg;
	int rc = 0;

	thread->tsc_last = now;
	/* process queued messages first */
	msg_count = msg_queue_run_batch(thread, max_msgs);
	if (msg_count) {
		rc = 1;
	}
	/* then run the active pollers */
	TAILQ_FOREACH_REVERSE_SAFE(poller, &thread->active_pollers,
				   active_pollers_head, tailq, tmp) {
		int poller_rc;

		poller_rc = thread_execute_poller(thread, poller);
		if (poller_rc > rc) {
			rc = poller_rc;
		}
	}
	/* finally run the timed pollers that are due */
	poller = thread->first_timed_poller;
	while (poller != NULL) {
		int timer_rc = 0;

		if (now < poller->next_run_tick) {
			break;
		}

		tmp = RB_NEXT(timed_pollers_tree, &thread->timed_pollers, poller);
		RB_REMOVE(timed_pollers_tree, &thread->timed_pollers, poller);

		/* Update the cache to the next timed poller in the list
		 * only if the current poller is still the closest, otherwise,
		 * do nothing because the cache has been already updated.
		 */
		if (thread->first_timed_poller == poller) {
			thread->first_timed_poller = tmp;
		}

		timer_rc = thread_execute_timed_poller(thread, poller, now);
		if (timer_rc > rc) {
			rc = timer_rc;
		}

		poller = tmp;
	}

	return rc;
}

再看看spdk_thread如何绑定到reactor上的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/* Excerpt from SPDK lib/thread/thread.c: create an spdk_thread. The key line
 * for scheduling is the g_thread_op_fn hook, which (under the app framework)
 * is reactor_thread_op and binds the new thread to a reactor. */
struct spdk_thread *
spdk_thread_create(const char *name, const struct spdk_cpuset *cpumask)
{
	struct spdk_thread *thread, *null_thread;
	struct spdk_msg *msgs[SPDK_MSG_MEMPOOL_CACHE_SIZE];
	int rc = 0, i;

	thread = calloc(1, sizeof(*thread) + g_ctx_sz);
	if (cpumask) {
		spdk_cpuset_copy(&thread->cpumask, cpumask);
	} else {
		spdk_cpuset_negate(&thread->cpumask); /* no mask given: allow every core */
	}

	RB_INIT(&thread->io_channels);
	TAILQ_INIT(&thread->active_pollers);
	RB_INIT(&thread->timed_pollers);
	TAILQ_INIT(&thread->paused_pollers);
	SLIST_INIT(&thread->msg_cache);
	thread->msg_cache_count = 0;

	thread->tsc_last = spdk_get_ticks();

	/* Monotonic increasing ID is set to each created poller beginning at 1. Once the
	 * ID exceeds UINT64_MAX a warning message is logged
	 */
	thread->next_poller_id = 1;

	thread->messages = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY);


	if (name) {
		snprintf(thread->name, sizeof(thread->name), "%s", name);
	} else {
		snprintf(thread->name, sizeof(thread->name), "%p", thread);
	}

	/* notify the framework so it can schedule the thread onto a reactor */
	if (g_new_thread_fn) {
		rc = g_new_thread_fn(thread);
	} else if (g_thread_op_supported_fn && g_thread_op_supported_fn(SPDK_THREAD_OP_NEW)) {
		rc = g_thread_op_fn(thread, SPDK_THREAD_OP_NEW);
	}

	if (rc != 0) {
		_free_thread(thread);
		return NULL;
	}

	thread->state = SPDK_THREAD_STATE_RUNNING;

	/* If this is the first thread, save it as the app thread. Use an atomic
	 * compare + exchange to guard against crazy users who might try to
	 * call spdk_thread_create() simultaneously on multiple threads.
	 */
	null_thread = NULL;
	__atomic_compare_exchange_n(&g_app_thread, &null_thread, thread, false,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);

	return thread;
}

只需关注以下代码片段

1
2
3
4
5
if (g_new_thread_fn) {
rc = g_new_thread_fn(thread);
} else if (g_thread_op_supported_fn && g_thread_op_supported_fn(SPDK_THREAD_OP_NEW)) {
rc = g_thread_op_fn(thread, SPDK_THREAD_OP_NEW);
}

回溯找到 g_thread_op_fn 的赋值过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/* Excerpt: install the thread-operation hooks used by spdk_thread_create(). */
int
spdk_thread_lib_init_ext(spdk_thread_op_fn thread_op_fn,
			 spdk_thread_op_supported_fn thread_op_supported_fn,
			 size_t ctx_sz, size_t msg_mempool_sz)
{
	g_thread_op_fn = thread_op_fn;
	g_thread_op_supported_fn = thread_op_supported_fn;

	return _thread_lib_init(ctx_sz, msg_mempool_sz);
}

/* Excerpt (heavily elided — note `rc` is declared in the full upstream
 * function): the reactor framework registers reactor_thread_op as the
 * g_thread_op_fn hook. */
int
spdk_reactors_init(size_t msg_mempool_size)
{

	rc = spdk_thread_lib_init_ext(reactor_thread_op, reactor_thread_op_supported,
				      sizeof(struct spdk_lw_thread), msg_mempool_size);
	g_reactor_state = SPDK_REACTOR_STATE_INITIALIZED;

	return 0;
}

g_thread_op_fn即为reactor_thread_op函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* Excerpt: the framework's thread-op hook — a newly created spdk_thread is
 * handed to _reactor_schedule_thread() for placement on a reactor. */
static int
reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op)
{
	struct spdk_lw_thread *lw_thread;

	switch (op) {
	case SPDK_THREAD_OP_NEW:
		lw_thread = spdk_thread_get_ctx(thread);
		lw_thread->lcore = SPDK_ENV_LCORE_ID_ANY; /* no core chosen yet */
		return _reactor_schedule_thread(thread);
	case SPDK_THREAD_OP_RESCHED:
		_reactor_request_thread_reschedule(thread);
		return 0;
	default:
		return -ENOTSUP;
	}
}

/* Excerpt: pick a core for the new spdk_thread (respecting its cpumask and
 * the set of polling reactors), then post a _schedule_thread event to the
 * chosen reactor. */
static int
_reactor_schedule_thread(struct spdk_thread *thread)
{
	uint32_t core;
	struct spdk_lw_thread *lw_thread;
	struct spdk_event *evt = NULL;
	struct spdk_cpuset *cpumask;
	uint32_t i;
	struct spdk_reactor *local_reactor = NULL;
	uint32_t current_lcore = spdk_env_get_current_core();
	struct spdk_cpuset polling_cpumask;
	struct spdk_cpuset valid_cpumask;

	cpumask = spdk_thread_get_cpumask(thread);

	lw_thread = spdk_thread_get_ctx(thread);
	assert(lw_thread != NULL);
	core = lw_thread->lcore;
	memset(lw_thread, 0, sizeof(*lw_thread));

	if (current_lcore != SPDK_ENV_LCORE_ID_ANY) {
		local_reactor = spdk_reactor_get(current_lcore);
		assert(local_reactor);
	}

	/* When interrupt ability of spdk_thread is not enabled and the current
	 * reactor runs on DPDK thread, skip reactors which are in interrupt mode.
	 */
	if (!spdk_interrupt_mode_is_enabled() && local_reactor != NULL) {
		/* Get the cpumask of all reactors in polling */
		spdk_cpuset_zero(&polling_cpumask);
		SPDK_ENV_FOREACH_CORE(i) {
			spdk_cpuset_set_cpu(&polling_cpumask, i, true);
		}
		spdk_cpuset_xor(&polling_cpumask, &local_reactor->notify_cpuset);

		if (core == SPDK_ENV_LCORE_ID_ANY) {
			/* Get the cpumask of all valid reactors which are suggested and also in polling */
			spdk_cpuset_copy(&valid_cpumask, &polling_cpumask);
			spdk_cpuset_and(&valid_cpumask, spdk_thread_get_cpumask(thread));

			/* If there are any valid reactors, spdk_thread should be scheduled
			 * into one of the valid reactors.
			 * If there is no valid reactors, spdk_thread should be scheduled
			 * into one of the polling reactors.
			 */
			if (spdk_cpuset_count(&valid_cpumask) != 0) {
				cpumask = &valid_cpumask;
			} else {
				cpumask = &polling_cpumask;
			}
		} else if (!spdk_cpuset_get_cpu(&polling_cpumask, core)) {
			/* If specified reactor is not in polling, spdk_thread should be scheduled
			 * into one of the polling reactors.
			 */
			core = SPDK_ENV_LCORE_ID_ANY;
			cpumask = &polling_cpumask;
		}
	}

	pthread_mutex_lock(&g_scheduler_mtx);
	if (core == SPDK_ENV_LCORE_ID_ANY) {
		/* round-robin over the cores allowed by cpumask */
		for (i = 0; i < spdk_env_get_core_count(); i++) {
			if (g_next_core >= g_reactor_count) {
				g_next_core = spdk_env_get_first_core();
			}
			core = g_next_core;
			g_next_core = spdk_env_get_next_core(g_next_core);

			if (spdk_cpuset_get_cpu(cpumask, core)) {
				break;
			}
		}
	}
	/* allocate an event that runs _schedule_thread on the chosen core */
	evt = spdk_event_allocate(core, _schedule_thread, lw_thread, NULL);

	pthread_mutex_unlock(&g_scheduler_mtx);

	assert(evt != NULL);
	if (evt == NULL) {
		SPDK_ERRLOG("Unable to schedule thread on requested core mask.\n");
		return -1;
	}

	lw_thread->tsc_start = spdk_get_ticks();
	/* Post the event to the target reactor; it is dequeued and executed by
	 * event_queue_run_batch() inside _reactor_run(). */
	spdk_event_call(evt); /* Pass the given event to the associated lcore and call the function. */

	return 0;
}

重点放在_schedule_thread函数中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/* Excerpt: runs ON the target reactor's core (delivered via the event ring)
 * and attaches the lightweight thread to that reactor. */
static void
_schedule_thread(void *arg1, void *arg2)
{
	struct spdk_lw_thread *lw_thread = arg1;
	struct spdk_thread *thread;
	struct spdk_reactor *reactor;
	uint32_t current_core;
	struct spdk_fd_group *grp;

	current_core = spdk_env_get_current_core();
	reactor = spdk_reactor_get(current_core);
	assert(reactor != NULL);

	/* Update total_stats to reflect state of thread
	 * at the end of the move. */
	thread = spdk_thread_get_from_ctx(lw_thread);
	spdk_set_thread(thread);
	spdk_thread_get_stats(&lw_thread->total_stats);
	spdk_set_thread(NULL);

	lw_thread->lcore = current_core;

	TAILQ_INSERT_TAIL(&reactor->threads, lw_thread, link); /* append to this reactor's thread list */
	reactor->thread_count++;
}