eth = (struct ethhdr *)skb->data;
skb_pull_inline(skb, ETH_HLEN);

if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
	if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
		skb->pkt_type = PACKET_BROADCAST;
	else
		skb->pkt_type = PACKET_MULTICAST;
} else if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr)))
	skb->pkt_type = PACKET_OTHERHOST;

/*
 * Some variants of DSA tagging don't have an ethertype field
 * at all, so we check here whether one of those tagging
 * variants has been configured on the receiving interface,
 * and if so, set skb->protocol without looking at the packet.
 */
if (unlikely(netdev_uses_dsa(dev)))
	return htons(ETH_P_XDSA);

if (likely(eth_proto_is_802_3(eth->h_proto)))
	return eth->h_proto;

/*
 * This is a magic hack to spot IPX packets. Older Novell breaks
 * the protocol design and runs IPX over 802.3 without an 802.2 LLC
 * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
 * won't work for fault tolerant netware but does for the rest.
 */
sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point);
if (sap && *sap == 0xFFFF)
	return htons(ETH_P_802_3);

/*
 * Real 802.2 LLC
 */
return htons(ETH_P_802_2);
}
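In a driver receive path, eth_type_trans() is typically the last step before handing the buffer to the stack: it sets skb->pkt_type, pulls the Ethernet header and returns the value to store in skb->protocol. A minimal sketch, assuming the skb has already been filled with one received frame (the function name here is illustrative):

#include <linux/netdevice.h>
#include <linux/etherdevice.h>

/* Called from the NAPI poll loop (softirq context); skb already holds one frame. */
static void my_deliver(struct sk_buff *skb, struct net_device *dev)
{
	skb->protocol = eth_type_trans(skb, dev);
	netif_receive_skb(skb);
}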
/**
 * netif_napi_add() - initialize a NAPI context
 * @dev:    network device
 * @napi:   NAPI context
 * @poll:   polling function
 * @weight: default weight
 *
 * netif_napi_add() must be used to initialize a NAPI context prior to calling
 * *any* of the other NAPI-related functions.
 */
static inline void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
				  int (*poll)(struct napi_struct *, int), int weight)
/**
 * napi_enable - enable NAPI scheduling
 * @n: NAPI context
 *
 * Resume NAPI from being scheduled on this context.
 * Must be paired with napi_disable.
 */
void napi_enable(struct napi_struct *n)
/**
 * napi_schedule - schedule NAPI poll
 * @n: NAPI context
 *
 * Schedule NAPI poll routine to be called if it is not already
 * running.
 */
static inline void napi_schedule(struct napi_struct *n)
{
	if (napi_schedule_prep(n))
		__napi_schedule(n);
}
/**
 * napi_schedule_irqoff - schedule NAPI poll
 * @n: NAPI context
 *
 * Variant of napi_schedule(), assuming hard irqs are masked.
 */
static inline void napi_schedule_irqoff(struct napi_struct *n)
{
	if (napi_schedule_prep(n))
		__napi_schedule_irqoff(n);
}
/**
 * napi_schedule_prep - check if NAPI can be scheduled
 * @n: NAPI context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running. This is used as a condition variable to
 * ensure only one NAPI poll instance runs. We also make
 * sure there is no pending NAPI disable.
 */
bool napi_schedule_prep(struct napi_struct *n)
/**
 * napi_complete - NAPI processing complete
 * @n: NAPI context
 *
 * Mark NAPI processing as complete.
 * Consider using napi_complete_done() instead.
 * Return false if device should avoid rearming interrupts.
 */
static inline bool napi_complete(struct napi_struct *n)
{
	return napi_complete_done(n, 0);
}
static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length)
{
	return __napi_alloc_skb(napi, length, GFP_ATOMIC);
}

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask)
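Put together, these helpers give the usual NAPI pattern: register the context with netif_napi_add() at probe time (matching the four-argument prototype quoted above), napi_enable() it when the interface opens, napi_schedule() it from the interrupt handler, and complete it from the poll routine once the remaining work is below the budget. A sketch under those assumptions; struct my_priv and my_clean_rx() are hypothetical:

#include <linux/netdevice.h>
#include <linux/interrupt.h>

struct my_priv {
	struct net_device *netdev;
	struct napi_struct napi;
};

/* hypothetical: process up to @budget received packets, return how many */
static int my_clean_rx(struct my_priv *priv, int budget);

static irqreturn_t my_isr(int irq, void *data)
{
	struct my_priv *priv = data;

	/* mask device RX interrupts here, then defer the work to NAPI */
	napi_schedule(&priv->napi);
	return IRQ_HANDLED;
}

static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_priv *priv = container_of(napi, struct my_priv, napi);
	int done = my_clean_rx(priv, budget);

	if (done < budget && napi_complete_done(napi, done)) {
		/* re-enable device RX interrupts here */
	}
	return done;
}

static void my_probe_napi(struct my_priv *priv)
{
	netif_napi_add(priv->netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
}

static int my_open(struct net_device *netdev)
{
	struct my_priv *priv = netdev_priv(netdev);

	napi_enable(&priv->napi);
	return 0;
}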
/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
 * the upper (protocol) levels to process via the backlog NAPI device. It
 * always succeeds. The buffer may be dropped during processing for
 * congestion control or by the protocol layers.
 * The network buffer is passed via the backlog NAPI device. Modern NIC
 * drivers should use NAPI and GRO.
 * This function can be used from interrupt and from process context. The
 * caller from process context must not disable interrupts before invoking
 * this function.
 *
 * return values:
 * NET_RX_SUCCESS (no congestion)
 * NET_RX_DROP    (packet was dropped)
 */
int netif_rx(struct sk_buff *skb)
/**
 * netif_receive_skb - process receive buffer from network
 * @skb: buffer to process
 *
 * netif_receive_skb() is the main receive data processing function.
 * It always succeeds. The buffer may be dropped during processing
 * for congestion control or by the protocol layers.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return values (usually ignored):
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
/**
 * netif_receive_skb_core - special purpose version of netif_receive_skb
 * @skb: buffer to process
 *
 * More direct receive version of netif_receive_skb(). It should
 * only be used by callers that have a need to skip RPS and Generic XDP.
 * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return values (usually ignored):
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb_core(struct sk_buff *skb)
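As a contrast to the NAPI flow above, a simple non-NAPI driver can hand the buffer to the backlog with netif_rx() directly from its interrupt handler, since netif_rx() is usable from interrupt context while netif_receive_skb() must run in softirq context. A hedged sketch; MY_MAX_FRAME and my_read_frame() are hypothetical:

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/interrupt.h>

#define MY_MAX_FRAME 1536	/* illustrative RX buffer size */

/* hypothetical: copy the received frame into the skb and skb_put() its length */
static void my_read_frame(struct net_device *dev, struct sk_buff *skb);

static irqreturn_t my_legacy_isr(int irq, void *data)
{
	struct net_device *dev = data;
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, MY_MAX_FRAME);
	if (!skb)
		return IRQ_HANDLED;	/* drop on allocation failure */

	my_read_frame(dev, skb);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
	return IRQ_HANDLED;
}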
/**
 * is_valid_ether_addr - Determine if the given Ethernet address is valid
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not
 * a multicast address, and is not FF:FF:FF:FF:FF:FF.
 *
 * Return true if the address is valid.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_valid_ether_addr(const u8 *addr)
{
	/* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to
	 * explicitly check for it here. */
	return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr);
}
/**
 * eth_random_addr - Generate software assigned random Ethernet address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Generate a random Ethernet address (MAC) that is not multicast
 * and has the local assigned bit set.
 */
static inline void eth_random_addr(u8 *addr)
{
	get_random_bytes(addr, ETH_ALEN);
	addr[0] &= 0xfe;	/* clear multicast bit */
	addr[0] |= 0x02;	/* set local assignment bit (IEEE802) */
}
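A common pattern is to validate the address read from hardware or firmware and fall back to a random, locally administered one when it is unusable. A small sketch under that assumption (the caller then assigns the result to the netdev, e.g. with eth_hw_addr_set() on recent kernels); both buffers must be u16-aligned as noted above:

#include <linux/etherdevice.h>

/* Fill @addr with a usable MAC: keep the one read from hardware if it is
 * valid, otherwise generate a random locally administered address. */
static void my_pick_mac(u8 *addr, const u8 *mac_from_hw)
{
	ether_addr_copy(addr, mac_from_hw);
	if (!is_valid_ether_addr(addr))
		eth_random_addr(addr);	/* not multicast, locally administered */
}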
/* Kernel timer */
struct timer_list {
	/*
	 * All fields that change during normal runtime grouped to the
	 * same cacheline
	 */
	struct hlist_node	entry;
	unsigned long		expires;	/* expiration time, in jiffies */
	void			(*function)(struct timer_list *);
	u32			flags;
};
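Since the callback takes a struct timer_list *, this is the timer_setup() era of the API: the timer is embedded in a private structure and the callback recovers that structure with from_timer(). A minimal sketch, assuming a hypothetical struct my_dev:

#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_dev {
	struct timer_list poll_timer;
};

static void my_timer_fn(struct timer_list *t)
{
	struct my_dev *dev = from_timer(dev, t, poll_timer);

	/* ... periodic work ... */
	mod_timer(&dev->poll_timer, jiffies + msecs_to_jiffies(100));	/* re-arm */
}

static void my_dev_start(struct my_dev *dev)
{
	timer_setup(&dev->poll_timer, my_timer_fn, 0);
	mod_timer(&dev->poll_timer, jiffies + msecs_to_jiffies(100));
	/* teardown later with del_timer_sync(&dev->poll_timer); */
}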
Options:
 -C, --clear                 clear the kernel ring buffer
 -c, --read-clear            read and clear all messages
 -D, --console-off           disable printing messages to console
 -E, --console-on            enable printing messages to console
 -F, --file <file>           use the file instead of the kernel log buffer
 -f, --facility <list>       restrict output to defined facilities
 -H, --human                 human readable output
 -k, --kernel                display kernel messages
 -L, --color[=<when>]        colorize messages (auto, always or never)
                               colors are enabled by default
 -l, --level <list>          restrict output to defined levels
 -n, --console-level <level> set level of messages printed to console
 -P, --nopager               do not pipe output into a pager
 -p, --force-prefix          force timestamp output on each line of multi-line messages
 -r, --raw                   print the raw message buffer
 -S, --syslog                force to use syslog(2) rather than /dev/kmsg
 -s, --buffer-size <size>    buffer size to query the kernel ring buffer
 -u, --userspace             display userspace messages
 -w, --follow                wait for new messages
 -x, --decode                decode facility and level to readable string
 -d, --show-delta            show time delta between printed messages
 -e, --reltime               show local time and time delta in readable format
 -T, --ctime                 show human-readable timestamp (may be inaccurate!)
 -t, --notime                don't show any timestamp with messages
     --time-format <format>  show timestamp using the given format:
                               [delta|reltime|ctime|notime|iso]
                             Suspending/resume will make ctime and iso timestamps inaccurate.
 -h, --help                  display this help
 -V, --version               display version
Supported log facilities:
    kern - kernel messages
    user - random user-level messages
    mail - mail system
  daemon - system daemons
    auth - security/authorization messages
  syslog - messages generated internally by syslogd
     lpr - line printer subsystem
    news - network news subsystem
Supported log levels (priorities):
   emerg - system is unusable
   alert - action must be taken immediately
    crit - critical conditions
     err - error conditions
    warn - warning conditions
  notice - normal but significant condition
    info - informational
   debug - debug-level messages
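A couple of illustrative invocations combining the options above (adjust the facility and level lists to taste):

dmesg -H -k -l err,warn     # human-readable kernel errors and warnings only
dmesg -w -e                 # follow new messages with readable relative timestamps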
mkfs.ext4 -b 65536 /dev/nvme1n1p1
Warning: blocksize 65536 not usable on most systems.
mke2fs 1.45.5 (07-Jan-2020)
/dev/nvme1n1p1 contains a ext4 file system
        created on Fri Apr 21 17:00:47 2023
Proceed anyway? (y,N) y
mkfs.ext4: 65536-byte blocks too big for system (max 4096)
Proceed anyway? (y,N) y
Warning: 65536-byte blocks too big for system (max 4096), forced to continue
Creating filesystem with 163824 64k blocks and 164352 inodes
Filesystem UUID: 0d389dff-8d36-43b1-8362-2482ef7002b7
Superblock backups stored on blocks:
        65528

Allocating group tables: done
Writing inode tables: done
Creating journal (4096 blocks): done
Writing superblocks and filesystem accounting information: done
I thought the job was done, but mounting the filesystem failed:
mount -t ext4 /dev/nvme1n1p1 test
mount: test: wrong fs type, bad option, bad superblock on /dev/nvme1n1p1, missing codepage or helper program, or other error.
dmesg output:
[18738.045713] EXT4-fs (nvme1n1p1): bad block size 65536
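The cause is the block size: ext4 will not mount a filesystem whose block size exceeds the kernel's page size (4096 bytes on typical x86_64 systems), which is exactly what both mkfs.ext4 and the kernel warned about. Re-creating the filesystem with a supported block size resolves it, for example:

getconf PAGESIZE                    # typically prints 4096 on x86_64
mkfs.ext4 -b 4096 /dev/nvme1n1p1
mount -t ext4 /dev/nvme1n1p1 test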
tldr dumpe2fs

dumpe2fs
Print the super block and blocks group information for ext2/ext3/ext4 filesystems.
Unmount the partition before running this command using umount {{device}}.
More information: https://manned.org/dumpe2fs.
- Display ext2, ext3 and ext4 filesystem information: dumpe2fs {{/dev/sdXN}}
- Display the blocks which are reserved as bad in the filesystem: dumpe2fs -b {{/dev/sdXN}}
- Force display filesystem information even with unrecognizable feature flags: dumpe2fs -f {{/dev/sdXN}}
- Only display the superblock information and not any of the block group descriptor detail information: dumpe2fs -h {{/dev/sdXN}}
- Print the detailed group information block numbers in hexadecimal format: dumpe2fs -x {{/dev/sdXN}}
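For instance, to confirm the block size actually recorded in the superblock after re-creating the filesystem:

dumpe2fs -h /dev/nvme1n1p1 | grep -i 'block size'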
/**
 * struct pci_driver - PCI driver structure
 * @node:	List of driver structures.
 * @name:	Driver name.
 * @id_table:	Pointer to table of device IDs the driver is
 *		interested in. Most drivers should export this
 *		table using MODULE_DEVICE_TABLE(pci,...).
 * @probe:	This probing function gets called (during execution
 *		of pci_register_driver() for already existing
 *		devices or later if a new device gets inserted) for
 *		all PCI devices which match the ID table and are not
 *		"owned" by the other drivers yet. This function gets
 *		passed a "struct pci_dev \*" for each device whose
 *		entry in the ID table matches the device. The probe
 *		function returns zero when the driver chooses to
 *		take "ownership" of the device or an error code
 *		(negative number) otherwise.
 *		The probe function always gets called from process
 *		context, so it can sleep.
 * @remove:	The remove() function gets called whenever a device
 *		being handled by this driver is removed (either during
 *		deregistration of the driver or when it's manually
 *		pulled out of a hot-pluggable slot).
 *		The remove function always gets called from process
 *		context, so it can sleep.
 * @suspend:	Put device into low power state.
 * @suspend_late: Put device into low power state.
 * @resume_early: Wake device from low power state.
 * @resume:	Wake device from low power state.
 *		(Please see Documentation/power/pci.rst for descriptions
 *		of PCI Power Management and the related functions.)
 * @shutdown:	Hook into reboot_notifier_list (kernel/sys.c).
 *		Intended to stop any idling DMA operations.
 *		Useful for enabling wake-on-lan (NIC) or changing
 *		the power state of a device before reboot.
 *		e.g. drivers/net/e100.c.
 * @sriov_configure: Optional driver callback to allow configuration of
 *		number of VFs to enable via sysfs "sriov_numvfs" file.
 * @err_handler: See Documentation/PCI/pci-error-recovery.rst
 * @groups:	Sysfs attribute groups.
 * @driver:	Driver model structure.
 * @dynids:	List of dynamically added device IDs.
 */
struct pci_driver {
	struct list_head	node;
	const char		*name;
	const struct pci_device_id *id_table;	/* Must be non-NULL for probe to be called */
	int  (*probe)(struct pci_dev *dev, const struct pci_device_id *id);	/* New device inserted */
	void (*remove)(struct pci_dev *dev);	/* Device removed (NULL if not a hot-plug capable driver) */
	int  (*suspend)(struct pci_dev *dev, pm_message_t state);	/* Device suspended */
	int  (*suspend_late)(struct pci_dev *dev, pm_message_t state);
	int  (*resume_early)(struct pci_dev *dev);
	int  (*resume)(struct pci_dev *dev);	/* Device woken up */
	void (*shutdown)(struct pci_dev *dev);
	int  (*sriov_configure)(struct pci_dev *dev, int num_vfs);	/* On PF */
	const struct pci_error_handlers *err_handler;
	const struct attribute_group **groups;
	struct device_driver	driver;
	struct pci_dynids	dynids;
};
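A minimal registration sketch built on this structure; the vendor/device IDs and the probe/remove bodies are hypothetical placeholders:

#include <linux/module.h>
#include <linux/pci.h>

static const struct pci_device_id my_ids[] = {
	{ PCI_DEVICE(0x1234, 0x5678) },		/* hypothetical vendor/device ID */
	{ }
};
MODULE_DEVICE_TABLE(pci, my_ids);

static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int err = pci_enable_device(pdev);

	if (err)
		return err;
	/* map BARs, request IRQs, allocate private state, ... */
	return 0;
}

static void my_remove(struct pci_dev *pdev)
{
	pci_disable_device(pdev);
}

static struct pci_driver my_driver = {
	.name     = "my_pci_drv",
	.id_table = my_ids,
	.probe    = my_probe,
	.remove   = my_remove,
};
module_pci_driver(my_driver);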
/*
 * Note about locking: no locking is required as long as only one reader
 * and one writer are using the fifo and no kfifo_reset() will be called.
 * kfifo_reset_out() can be safely used, as long as it is only called
 * from the reader thread.
 * For multiple writers and one reader there is only a need to lock the writer.
 * And vice versa: for only one writer and multiple readers there is only a need
 * to lock the reader.
 */
/**
 * kfifo_in_spinlocked - put data into the fifo using a spinlock for locking
 * @fifo: address of the fifo to be used
 * @buf: the data to be added
 * @n: number of elements to be added
 * @lock: pointer to the spinlock to use for locking
 *
 * This macro copies the given buffer of values into the fifo and returns the
 * number of copied elements.
 */
#define kfifo_in_spinlocked(fifo, buf, n, lock) \
({ \
	unsigned long __flags; \
	unsigned int __ret; \
	spin_lock_irqsave(lock, __flags); \
	__ret = kfifo_in(fifo, buf, n); \
	spin_unlock_irqrestore(lock, __flags); \
	__ret; \
})
/**
 * kfifo_out_spinlocked - get data from the fifo using a spinlock for locking
 * @fifo: address of the fifo to be used
 * @buf: pointer to the storage buffer
 * @n: max. number of elements to get
 * @lock: pointer to the spinlock to use for locking
 *
 * This macro gets data from the fifo and returns the number of elements
 * copied.
 */
#define kfifo_out_spinlocked(fifo, buf, n, lock) \
__kfifo_uint_must_check_helper( \
({ \
	unsigned long __flags; \
	unsigned int __ret; \
	spin_lock_irqsave(lock, __flags); \
	__ret = kfifo_out(fifo, buf, n); \
	spin_unlock_irqrestore(lock, __flags); \
	__ret; \
}) \
)
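Per the locking note above, a single-producer/single-consumer fifo needs no lock at all; the spinlocked variants are for the case where multiple writers (or readers) may race. A small sketch along those lines, with hypothetical producer/consumer helpers:

#include <linux/kfifo.h>
#include <linux/spinlock.h>

static DEFINE_KFIFO(my_fifo, int, 64);		/* 64 ints; size must be a power of two */
static DEFINE_SPINLOCK(my_fifo_lock);

/* producer: may be called from several contexts, hence the lock */
static void my_push(int val)
{
	kfifo_in_spinlocked(&my_fifo, &val, 1, &my_fifo_lock);
}

/* consumer: returns 1 if an element was copied into *val, 0 if the fifo was empty */
static int my_pop(int *val)
{
	return kfifo_out_spinlocked(&my_fifo, val, 1, &my_fifo_lock);
}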
static inline void *kvmalloc(size_t size, gfp_t flags)
{
	return kvmalloc_node(size, flags, NUMA_NO_NODE);
}

/**
 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 * failure, fall back to non-contiguous (vmalloc) allocation.
 * @size: size of the request.
 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 * @node: numa node to allocate from
 *
 * Uses kmalloc to get the memory but if the allocation fails then falls back
 * to the vmalloc allocator. Use kvfree for freeing the memory.
 *
 * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 *
 * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
 * fall back to vmalloc.
 *
 * Return: pointer to the allocated memory or %NULL in case of failure
 */
void *kvmalloc_node(size_t size, gfp_t flags, int node)
/**
 * vmalloc - allocate virtually contiguous memory
 * @size: allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc(unsigned long size)
{
	return __vmalloc_node_flags(size, NUMA_NO_NODE, GFP_KERNEL);
}
/**
 * kmalloc - allocate memory
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate.
 *
 * kmalloc is the normal method of allocating memory
 * for objects smaller than page size in the kernel.
 *
 * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
 * bytes. For @size of power of two bytes, the alignment is also guaranteed
 * to be at least to the size.
 *
 * The @flags argument may be one of the GFP flags defined at
 * include/linux/gfp.h and described at
 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
 *
 * The recommended usage of the @flags is described at
 * :ref:`Documentation/core-api/memory-allocation.rst <memory-allocation>`
 *
 * Below is a brief outline of the most useful GFP flags
 *
 * %GFP_KERNEL
 *	Allocate normal kernel ram. May sleep.
 *
 * %GFP_NOWAIT
 *	Allocation will not sleep.
 *
 * %GFP_ATOMIC
 *	Allocation will not sleep. May use emergency pools.
 *
 * %GFP_HIGHUSER
 *	Allocate memory from high memory on behalf of user.
 *
 * Also it is possible to set different flags by OR'ing
 * in one or more of the following additional @flags:
 *
 * %__GFP_HIGH
 *	This allocation has high priority and may use emergency pools.
 *
 * %__GFP_NOFAIL
 *	Indicate that this allocation is in no way allowed to fail
 *	(think twice before using).
 *
 * %__GFP_NORETRY
 *	If memory is not immediately available,
 *	then give up at once.
 *
 * %__GFP_NOWARN
 *	If allocation fails, don't issue any warnings.
 *
 * %__GFP_RETRY_MAYFAIL
 *	Try really hard to succeed the allocation but fail
 *	eventually.
 */
static __always_inline void *kmalloc(size_t size, gfp_t flags)
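A short sketch showing a typical way these allocators are combined: kmalloc() for a small, physically contiguous buffer and kvmalloc()/kvfree() for a larger one that may fall back to vmalloc. The function and sizes are illustrative only:

#include <linux/slab.h>
#include <linux/mm.h>

static int alloc_examples(size_t nbytes)
{
	void *small, *large;

	/* Small, physically contiguous allocation; GFP_KERNEL may sleep. */
	small = kmalloc(nbytes, GFP_KERNEL);
	if (!small)
		return -ENOMEM;

	/* Larger allocation: try kmalloc first, fall back to vmalloc. */
	large = kvmalloc(nbytes * 1024, GFP_KERNEL);
	if (!large) {
		kfree(small);
		return -ENOMEM;
	}

	/* ... use the buffers ... */

	kvfree(large);	/* correct for both kmalloc- and vmalloc-backed memory */
	kfree(small);
	return 0;
}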
delays - Information on the various kernel delay / sleep mechanisms -------------------------------------------------------------------
This document seeks to answer the common question: "What is the RightWay (TM) to insert a delay?"
This question is most often faced by driver writers who have to deal with hardware delays and who may not be the most intimately familiar with the inner workings of the Linux Kernel.
Inserting Delays ----------------
The first, and most important, question you need to ask is "Is my code in an atomic context?" This should be followed closely by "Does it really need to delay in atomic context?" If so...
ATOMIC CONTEXT: You must use the *delay family of functions. These functions use the jiffie estimation of clock speed and will busy wait for enough loop cycles to achieve the desired delay:
udelay is the generally preferred API; ndelay-level precision may not actually exist on many non-PC devices.
mdelay is a macro wrapper around udelay, to account for possible overflow when passing large arguments to udelay. In general, use of mdelay is discouraged and code should be refactored to allow for the use of msleep.
NON-ATOMIC CONTEXT: You should use the *sleep[_range] family of functions. There are a few more options here, while any of them may work correctly, using the "right" sleep function will help the scheduler, power management, and just make your driver better :)
-- Backed by busy-wait loop:
   udelay(unsigned long usecs)
-- Backed by hrtimers:
   usleep_range(unsigned long min, unsigned long max)
-- Backed by jiffies / legacy_timers:
   msleep(unsigned long msecs)
   msleep_interruptible(unsigned long msecs)
Unlike the *delay family, the underlying mechanism driving each of these calls varies, thus there are quirks you should be aware of.
SLEEPING FOR "A FEW" USECS ( < ~10us? ):
	* Use udelay
- Why not usleep? On slower systems, (embedded, OR perhaps a speed-stepped PC!) the overhead of setting up the hrtimers for usleep *may* not be worth it. Such an evaluation will obviously depend on your specific situation, but it is something to be aware of.
SLEEPING FOR ~USECS OR SMALL MSECS ( 10us - 20ms):
	* Use usleep_range
- Why not msleep for (1ms - 20ms)? Explained originally here: http://lkml.org/lkml/2007/8/3/250 msleep(1~20) may not do what the caller intends, and will often sleep longer (~20 ms actual sleep for any value given in the 1~20ms range). In many cases this is not the desired behavior.
- Why is there no "usleep" / What is a good range? Since usleep_range is built on top of hrtimers, the wakeup will be very precise(ish), thus a simple usleep function would likely introduce a large number of undesired interrupts.
With the introduction of a range, the scheduler is free to coalesce your wakeup with any other wakeup that may have happened for other reasons, or at the worst case, fire an interrupt for your upper bound.
The larger a range you supply, the greater a chance that you will not trigger an interrupt; this should be balanced with what is an acceptable upper bound on delay / performance for your specific code path. Exact tolerances here are very situation specific, thus it is left to the caller to determine a reasonable range.
SLEEPING FOR LARGER MSECS ( 10ms+ ):
	* Use msleep or possibly msleep_interruptible
- What's the difference? msleep sets the current task to TASK_UNINTERRUPTIBLE whereas msleep_interruptible sets the current task to TASK_INTERRUPTIBLE before scheduling the sleep. In short, the difference is whether the sleep can be ended early by a signal. In general, just use msleep unless you know you have a need for the interruptible variant.
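To make the guidance above concrete, here is a small illustrative sketch showing which primitive fits which situation; the calls are independent examples, not a single real sequence:

#include <linux/delay.h>

static void my_delay_examples(void)
{
	/* Atomic context (e.g. under a spinlock or in an IRQ handler):
	 * busy-wait for a few microseconds. */
	udelay(5);

	/* Process context, roughly 10us - 20ms: give the scheduler a
	 * range so it can coalesce wakeups. */
	usleep_range(100, 200);

	/* Process context, 10ms and up: jiffies-backed sleep. */
	msleep(50);
}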
ttyS0 is the device for the first UART serial port on x86 and x86_64 architectures. If you have a PC motherboard with serial ports, you'd use a ttySn to attach a modem or a serial console.

ttyUSB0 is the device for the first USB serial converter. If you have a USB serial cable, you'd use a ttyUSBn to connect to the serial port of a router.

ttyAMA0 is the device for the first serial port on the ARM architecture. If you have an ARM-based TV box with a serial console running Android or OpenELEC, you'd use a ttyAMAn to attach a console to it.
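For example, to get a console on the first UART you would typically pass a console= parameter on the target's kernel command line and attach a terminal program to the matching device on the host side (device names and the 115200 baud rate are common defaults; adjust for your board):

console=ttyS0,115200n8          # kernel command line on the target
screen /dev/ttyUSB0 115200      # on the host, through a USB serial converter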
Useful for reliable timestamps and measuring short time intervals accurately. Starts at system boot time but stops during suspend.
ktime_t ktime_get_boottime(void)
	CLOCK_BOOTTIME
Like ktime_get(), but does not stop when suspended. This can be used e.g. for key expiration times that need to be synchronized with other machines across a suspend operation.
ktime_t ktime_get_real(void)
	CLOCK_REALTIME
Returns the time relative to the UNIX epoch starting in 1970 using Coordinated Universal Time (UTC), the same as gettimeofday() in user space. This is used for all timestamps that need to persist across a reboot, like inode times, but should be avoided for internal uses, since it can jump backwards due to a leap second update, an NTP adjustment or a settimeofday() operation from user space.
ktime_t ktime_get_clocktai(void)
	CLOCK_TAI
Like ktime_get_real(), but uses the International Atomic Time (TAI) reference instead of UTC to avoid jumping on leap second updates. This is rarely useful in the kernel.
ktime_t ktime_get_raw(void)
	CLOCK_MONOTONIC_RAW
Like ktime_get(), but runs at the same rate as the hardware clocksource without (NTP) adjustments for clock drift. This is also rarely needed in the kernel.
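A small sketch of the most common use of the monotonic clock, measuring a short interval with ktime_get(); the function being timed is left as a placeholder:

#include <linux/ktime.h>

static s64 time_my_operation(void)
{
	ktime_t start, end;

	start = ktime_get();
	/* ... the operation being measured ... */
	end = ktime_get();

	return ktime_to_ns(ktime_sub(end, start));	/* elapsed time in nanoseconds */
}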
quiet - this option tells the kernel NOT to produce any output (a.k.a. non-verbose mode). If you boot without this option, you'll see lots of kernel messages such as driver/module activations, filesystem checks and errors. Leaving out the quiet parameter can be useful when you need to track down an error.
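On a GRUB-based distribution, for example, the parameter usually lives in /etc/default/grub; removing it and regenerating the configuration makes the boot messages visible again (file location and regeneration command vary by distribution):

GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"   # drop "quiet" here to see boot messages
sudo update-grub                            # Debian/Ubuntu; elsewhere: grub2-mkconfig -o /boot/grub2/grub.cfg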