DMA is a familiar concept, but few of us actually program with it directly; the most common encounters are the ring buffers in NIC drivers and the DMA engines of individual devices. Let's dig into it.
DMA moves data directly between a device and system memory, without CPU intervention.
To simplify driver development, the kernel already provides a set of DMA function interfaces.
DMA is tied to the hardware architecture, so Linux hides the hardware-specific parts; readers who are interested can trace that code further on their own.
In the kernel's DMA-layer design, per-platform differences in DMA buffer mapping are hidden behind a kernel-defined set of DMA operations, in include/linux/dma-mapping.h:
struct dma_map_ops {
    void *(*alloc)(struct device *dev, size_t size,
                   dma_addr_t *dma_handle, gfp_t gfp,
                   struct dma_attrs *attrs);
    void (*free)(struct device *dev, size_t size, void *vaddr,
                 dma_addr_t dma_handle, struct dma_attrs *attrs);
    int (*mmap)(struct device *, struct vm_area_struct *, void *,
                dma_addr_t, size_t, struct dma_attrs *attrs);
    int (*get_sgtable)(struct device *dev, struct sg_table *sgt, void *,
                       dma_addr_t, size_t, struct dma_attrs *attrs);
    dma_addr_t (*map_page)(struct device *dev, struct page *page,
                           unsigned long offset, size_t size,
                           enum dma_data_direction dir,
                           struct dma_attrs *attrs);
    void (*unmap_page)(struct device *dev, dma_addr_t dma_handle,
                       size_t size, enum dma_data_direction dir,
                       struct dma_attrs *attrs);
    int (*map_sg)(struct device *dev, struct scatterlist *sg,
                  int nents, enum dma_data_direction dir,
                  struct dma_attrs *attrs);
    void (*unmap_sg)(struct device *dev,
                     struct scatterlist *sg, int nents,
                     enum dma_data_direction dir,
                     struct dma_attrs *attrs);
    void (*sync_single_for_cpu)(struct device *dev,
                                dma_addr_t dma_handle, size_t size,
                                enum dma_data_direction dir);
    void (*sync_single_for_device)(struct device *dev,
                                   dma_addr_t dma_handle, size_t size,
                                   enum dma_data_direction dir);
    void (*sync_sg_for_cpu)(struct device *dev,
                            struct scatterlist *sg, int nents,
                            enum dma_data_direction dir);
    void (*sync_sg_for_device)(struct device *dev,
                               struct scatterlist *sg, int nents,
                               enum dma_data_direction dir);
    int (*mapping_error)(struct device *dev, dma_addr_t dma_addr);
    int (*dma_supported)(struct device *dev, u64 mask);
    int (*set_dma_mask)(struct device *dev, u64 mask);
#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
    u64 (*get_required_mask)(struct device *dev);
#endif
    int is_phys;
};
This single operations table masks the implementation differences. Those differences stem mostly from caching: CPU caches and DMA must be kept coherent, a topic we will not explore in depth here.
Another commonly used function is dma_set_mask, which informs the kernel of the range of addresses the device can reach; many devices can only address a limited range.
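For instance, a device limited to 32-bit addressing would declare that in its probe path. A minimal sketch, where mydev_probe_dma() is a hypothetical helper, not e1000 code:

#include <linux/dma-mapping.h>

/* Minimal sketch: declare that this device can only address the low
 * 32 bits. Both the streaming and the coherent mask are set; each
 * call fails if the platform cannot satisfy the mask.
 */
static int mydev_probe_dma(struct device *dev)
{
    if (dma_set_mask(dev, DMA_BIT_MASK(32)))
        return -EIO;        /* streaming mappings impossible */
    if (dma_set_coherent_mask(dev, DMA_BIT_MASK(32)))
        return -EIO;        /* coherent allocations impossible */
    return 0;
}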
DMA mappings fall into three categories:
1. Coherent DMA mapping: dma_alloc_coherent

Used when the driver itself allocates the DMA buffer and the buffer lives as long as the module does. Parameters:
(1) the return value is a kernel virtual address for the buffer, which the driver can use directly;
(2) the third parameter, dma_handle, returns the corresponding bus/physical address for the device.

(This leaves a question: what if the buffer the driver uses was allocated not by itself but by another module? That is what the next category is for.)
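A minimal sketch of this pattern (device pointer, buffer size, and helper names are illustrative, not taken from e1000):

#include <linux/dma-mapping.h>

#define MYDEV_RING_BYTES 4096          /* illustrative size */

static void *ring_cpu;                 /* kernel virtual address, CPU side */
static dma_addr_t ring_dma;            /* bus address, device side */

/* Allocate a buffer the CPU and the device see coherently for the
 * driver's whole lifetime, e.g. a descriptor ring.
 */
static int mydev_alloc_ring(struct device *dev)
{
    ring_cpu = dma_alloc_coherent(dev, MYDEV_RING_BYTES, &ring_dma,
                                  GFP_KERNEL);
    if (!ring_cpu)
        return -ENOMEM;
    /* ... program ring_dma into a device base-address register ... */
    return 0;
}

static void mydev_free_ring(struct device *dev)
{
    dma_free_coherent(dev, MYDEV_RING_BYTES, ring_cpu, ring_dma);
}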
2. Streaming DMA mapping: dma_map_single

Typically used to map an existing kernel buffer, returning the bus/physical address. Use it when the driver needs to use a virtual address range handed in from another module as its DMA buffer; the buffer must be physically contiguous, and cache coherency is managed through the sync API, e.g. dma_sync_single_for_cpu.
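A sketch of the streaming pattern (all names illustrative): map a buffer handed in from elsewhere, let the device DMA into it, then return ownership to the CPU:

#include <linux/dma-mapping.h>

/* Map an externally supplied buffer for a device-to-memory transfer,
 * run the hardware, then unmap before the CPU reads the data.
 */
static int mydev_rx_one(struct device *dev, void *buf, size_t len)
{
    dma_addr_t dma = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);

    if (dma_mapping_error(dev, dma))
        return -ENOMEM;

    /* ... program 'dma' into the device; wait for the transfer ... */

    /* Ownership back to the CPU. dma_unmap_single() implies the sync;
     * use dma_sync_single_for_cpu()/..._for_device() instead when the
     * mapping is kept around and reused.
     */
    dma_unmap_single(dev, dma, len, DMA_FROM_DEVICE);
    return 0;
}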
3. Scatter/gather mapping: dma_map_sg, which maps a whole list of physically scattered buffers in one call.
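Roughly, under the same caveats (illustrative names, not driver code), scatter/gather mapping looks like this:

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

/* Map 'nents' discontiguous buffers in one call; the device then
 * walks the resulting (address, length) pairs.
 */
static int mydev_xmit_sg(struct device *dev, struct scatterlist *sgl,
                         int nents)
{
    struct scatterlist *sg;
    int i, mapped;

    mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);
    if (!mapped)
        return -ENOMEM;

    for_each_sg(sgl, sg, mapped, i) {
        /* program sg_dma_address(sg) and sg_dma_len(sg) into the
         * device's descriptors
         */
    }

    /* ... once the device is done: ... */
    dma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
    return 0;
}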
Sometimes we also need:

1. Bounce buffers: used when a CPU-side physical address is unsuitable for the device's DMA; data is copied ("bounced") through a buffer the device can reach.

2. DMA pools: ordinary DMA mappings come in whole-page multiples; when a driver needs smaller coherent DMA buffers it can use a DMA pool, a slab-like allocator built on dma_pool_create.
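A minimal sketch (pool name and block size illustrative):

#include <linux/dmapool.h>

/* A pool of small 64-byte coherent blocks, e.g. for per-command
 * descriptors far smaller than a page.
 */
static int mydev_use_pool(struct device *dev)
{
    struct dma_pool *pool;
    void *vaddr;
    dma_addr_t dma;

    pool = dma_pool_create("mydev_desc", dev, 64, 64, 0);
    if (!pool)
        return -ENOMEM;

    vaddr = dma_pool_alloc(pool, GFP_KERNEL, &dma);
    if (!vaddr) {
        dma_pool_destroy(pool);
        return -ENOMEM;
    }

    /* ... vaddr for the CPU side, dma for the device side ... */

    dma_pool_free(pool, vaddr, dma);
    dma_pool_destroy(pool);
    return 0;
}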
Now let's look at a concrete use of DMA in a NIC driver, using the Linux kernel e1000 driver as the reference:
drivers/net/ethernet/intel/e1000/*
Ring buffer
DMA buffers generally cannot live in high memory: the device's addressable range is limited (commonly a 32-bit mask), so allocations default to low memory.
Keep in mind that the device works with physical (bus) addresses, while driver code works with virtual addresses.
Let's see how a packet is transmitted, in e1000_main.c, starting from e1000_xmit_frame (the surrounding transmit path is not covered here):
static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
                                    struct net_device *netdev)
{
    struct e1000_adapter *adapter = netdev_priv(netdev);
    struct e1000_hw *hw = &adapter->hw;
    struct e1000_tx_ring *tx_ring;
    unsigned int first, max_per_txd = E1000_MAX_DATA_PER_TXD;
    unsigned int max_txd_pwr = E1000_MAX_TXD_PWR;
    unsigned int tx_flags = 0;
    unsigned int len = skb_headlen(skb);
    unsigned int nr_frags;
    unsigned int mss;
    int count = 0;
    int tso;
    unsigned int f;

    /* This goes back to the question of how to logically map a tx queue
     * to a flow. Right now, performance is impacted slightly negatively
     * if using multiple tx queues. If the stack breaks away from a
     * single qdisc implementation, we can look at this again.
     */
    tx_ring = adapter->tx_ring;

    if (unlikely(skb->len <= 0)) {
        dev_kfree_skb_any(skb);
        return NETDEV_TX_OK;
    }

    /* On PCI/PCI-X HW, if packet size is less than ETH_ZLEN,
     * packets may get corrupted during padding by HW.
     * To WA this issue, pad all small packets manually.
     */
    if (skb->len < ETH_ZLEN) {
        if (skb_pad(skb, ETH_ZLEN - skb->len))
            return NETDEV_TX_OK;
        skb->len = ETH_ZLEN;
        skb_set_tail_pointer(skb, ETH_ZLEN);
    }

    mss = skb_shinfo(skb)->gso_size;
    /* The controller does a simple calculation to
     * make sure there is enough room in the FIFO before
     * initiating the DMA for each buffer. The calc is:
     * 4 = ceil(buffer len/mss). To make sure we don't
     * overrun the FIFO, adjust the max buffer len if mss
     * drops.
     */
    if (mss) {
        u8 hdr_len;
        max_per_txd = min(mss << 2, max_per_txd);
        max_txd_pwr = fls(max_per_txd) - 1;

        hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
        if (skb->data_len && hdr_len == len) {
            switch (hw->mac_type) {
                unsigned int pull_size;
            case e1000_82544:
                /* Make sure we have room to chop off 4 bytes,
                 * and that the end alignment will work out to
                 * this hardware's requirements
                 * NOTE: this is a TSO only workaround
                 * if end byte alignment not correct move us
                 * into the next dword
                 */
                if ((unsigned long)(skb_tail_pointer(skb) - 1) & 4)
                    break;
                /* fall through */
                pull_size = min((unsigned int)4, skb->data_len);
                if (!__pskb_pull_tail(skb, pull_size)) {
                    e_err(drv, "__pskb_pull_tail "
                          "failed.\n");
                    dev_kfree_skb_any(skb);
                    return NETDEV_TX_OK;
                }
                len = skb_headlen(skb);
                break;
            default:
                /* do nothing */
                break;
            }
        }
    }

    /* reserve a descriptor for the offload context */
    if ((mss) || (skb->ip_summed == CHECKSUM_PARTIAL))
        count++;
    count++;

    /* Controller Erratum workaround */
    if (!skb->data_len && tx_ring->last_tx_tso && !skb_is_gso(skb))
        count++;

    count += TXD_USE_COUNT(len, max_txd_pwr);

    if (adapter->pcix_82544)
        count++;

    /* work-around for errata 10 and it applies to all controllers
     * in PCI-X mode, so add one more descriptor to the count
     */
    if (unlikely((hw->bus_type == e1000_bus_type_pcix) &&
                 (len > 2015)))
        count++;

    nr_frags = skb_shinfo(skb)->nr_frags;
    for (f = 0; f < nr_frags; f++)
        count += TXD_USE_COUNT(skb_frag_size(&skb_shinfo(skb)->frags[f]),
                               max_txd_pwr);
    if (adapter->pcix_82544)
        count += nr_frags;

    /* need: count + 2 desc gap to keep tail from touching
     * head, otherwise try next time
     */
    if (unlikely(e1000_maybe_stop_tx(netdev, tx_ring, count + 2)))
        return NETDEV_TX_BUSY;

    if (unlikely((hw->mac_type == e1000_82547) &&
                 (e1000_82547_fifo_workaround(adapter, skb)))) {
        netif_stop_queue(netdev);
        if (!test_bit(__E1000_DOWN, &adapter->flags))
            schedule_delayed_work(&adapter->fifo_stall_task, 1);
        return NETDEV_TX_BUSY;
    }

    if (vlan_tx_tag_present(skb)) {
        tx_flags |= E1000_TX_FLAGS_VLAN;
        tx_flags |= (vlan_tx_tag_get(skb) << E1000_TX_FLAGS_VLAN_SHIFT);
    }

    first = tx_ring->next_to_use;

    tso = e1000_tso(adapter, tx_ring, skb);
    if (tso < 0) {
        dev_kfree_skb_any(skb);
        return NETDEV_TX_OK;
    }

    if (likely(tso)) {
        if (likely(hw->mac_type != e1000_82544))
            tx_ring->last_tx_tso = true;
        tx_flags |= E1000_TX_FLAGS_TSO;
    } else if (likely(e1000_tx_csum(adapter, tx_ring, skb)))
        tx_flags |= E1000_TX_FLAGS_CSUM;

    if (likely(skb->protocol == htons(ETH_P_IP)))
        tx_flags |= E1000_TX_FLAGS_IPV4;

    if (unlikely(skb->no_fcs))
        tx_flags |= E1000_TX_FLAGS_NO_FCS;

    count = e1000_tx_map(adapter, tx_ring, skb, first, max_per_txd,
                         nr_frags, mss);

    if (count) {
        netdev_sent_queue(netdev, skb->len);
        skb_tx_timestamp(skb);

        e1000_tx_queue(adapter, tx_ring, tx_flags, count);
        /* Make sure there is space in the ring for the next send. */
        e1000_maybe_stop_tx(netdev, tx_ring, MAX_SKB_FRAGS + 2);
    } else {
        dev_kfree_skb_any(skb);
        tx_ring->buffer_info[first].time_stamp = 0;
        tx_ring->next_to_use = first;
    }

    return NETDEV_TX_OK;
}
After the previous stage — the neighbour subsystem — the frame has reached the driver, with the data sitting in the memory the skb points to.
In the code,

tx_ring = adapter->tx_ring;    /* grab the transmit ring buffer */

and then the key call:

count = e1000_tx_map(adapter, tx_ring, skb, first, max_per_txd, nr_frags, mss);

What does it do?
static int e1000_tx_map(struct e1000_adapter *adapter,
                        struct e1000_tx_ring *tx_ring,
                        struct sk_buff *skb, unsigned int first,
                        unsigned int max_per_txd, unsigned int nr_frags,
                        unsigned int mss)
{
    struct e1000_hw *hw = &adapter->hw;
    struct pci_dev *pdev = adapter->pdev;
    struct e1000_buffer *buffer_info;
    unsigned int len = skb_headlen(skb);
    unsigned int offset = 0, size, count = 0, i;
    unsigned int f, bytecount, segs;

    i = tx_ring->next_to_use;

    while (len) {
        buffer_info = &tx_ring->buffer_info[i];
        size = min(len, max_per_txd);
        /* Workaround for Controller erratum --
         * descriptor for non-tso packet in a linear SKB that follows a
         * tso gets written back prematurely before the data is fully
         * DMA'd to the controller
         */
        if (!skb->data_len && tx_ring->last_tx_tso &&
            !skb_is_gso(skb)) {
            tx_ring->last_tx_tso = false;
            size -= 4;
        }

        /* Workaround for premature desc write-backs
         * in TSO mode. Append 4-byte sentinel desc
         */
        if (unlikely(mss && !nr_frags && size == len && size > 8))
            size -= 4;
        /* work-around for errata 10 and it applies
         * to all controllers in PCI-X mode
         * The fix is to make sure that the first descriptor of a
         * packet is smaller than 2048 - 16 - 16 (or 2016) bytes
         */
        if (unlikely((hw->bus_type == e1000_bus_type_pcix) &&
                     (size > 2015) && count == 0))
            size = 2015;

        /* Workaround for potential 82544 hang in PCI-X. Avoid
         * terminating buffers within evenly-aligned dwords.
         */
        if (unlikely(adapter->pcix_82544 &&
                     !((unsigned long)(skb->data + offset + size - 1) & 4) &&
                     size > 4))
            size -= 4;

        buffer_info->length = size;
        /* set time_stamp *before* dma to help avoid a possible race */
        buffer_info->time_stamp = jiffies;
        buffer_info->mapped_as_page = false;
        buffer_info->dma = dma_map_single(&pdev->dev,
                                          skb->data + offset,
                                          size, DMA_TO_DEVICE);
        if (dma_mapping_error(&pdev->dev, buffer_info->dma))
            goto dma_error;
        buffer_info->next_to_watch = i;

        len -= size;
        offset += size;
        count++;
        if (len) {
            i++;
            if (unlikely(i == tx_ring->count))
                i = 0;
        }
    }

    for (f = 0; f < nr_frags; f++) {
        const struct skb_frag_struct *frag;

        frag = &skb_shinfo(skb)->frags[f];
        len = skb_frag_size(frag);
        offset = 0;

        while (len) {
            unsigned long bufend;
            i++;
            if (unlikely(i == tx_ring->count))
                i = 0;

            buffer_info = &tx_ring->buffer_info[i];
            size = min(len, max_per_txd);
            /* Workaround for premature desc write-backs
             * in TSO mode. Append 4-byte sentinel desc
             */
            if (unlikely(mss && f == (nr_frags-1) &&
                         size == len && size > 8))
                size -= 4;
            /* Workaround for potential 82544 hang in PCI-X.
             * Avoid terminating buffers within evenly-aligned
             * dwords.
             */
            bufend = (unsigned long)
                page_to_phys(skb_frag_page(frag));
            bufend += offset + size - 1;
            if (unlikely(adapter->pcix_82544 &&
                         !(bufend & 4) &&
                         size > 4))
                size -= 4;

            buffer_info->length = size;
            buffer_info->time_stamp = jiffies;
            buffer_info->mapped_as_page = true;
            buffer_info->dma = skb_frag_dma_map(&pdev->dev, frag,
                                                offset, size,
                                                DMA_TO_DEVICE);
            if (dma_mapping_error(&pdev->dev, buffer_info->dma))
                goto dma_error;
            buffer_info->next_to_watch = i;

            len -= size;
            offset += size;
            count++;
        }
    }

    segs = skb_shinfo(skb)->gso_segs ?: 1;
    /* multiply data chunks by size of headers */
    bytecount = ((segs - 1) * skb_headlen(skb)) + skb->len;

    tx_ring->buffer_info[i].skb = skb;
    tx_ring->buffer_info[i].segs = segs;
    tx_ring->buffer_info[i].bytecount = bytecount;
    tx_ring->buffer_info[first].next_to_watch = i;

    return count;

dma_error:
    dev_err(&pdev->dev, "TX DMA map failed\n");
    buffer_info->dma = 0;
    if (count)
        count--;

    while (count--) {
        if (i == 0)
            i += tx_ring->count;
        i--;
        buffer_info = &tx_ring->buffer_info[i];
        e1000_unmap_and_free_tx_resource(adapter, buffer_info);
    }

    return 0;
}
Assume the packet is a simple linear skb with no fragments. Execution then enters the first while (len) loop, which grabs

buffer_info = &tx_ring->buffer_info[i];

and calls dma_map_single to create a streaming mapping, associating skb->data (a kernel virtual address) with buffer_info->dma (a bus/physical address usable by the device): accessing either address reaches the same region of memory.
buffer_info->length = size;
/* set time_stamp *before* dma to help avoid a possible race */
buffer_info->time_stamp = jiffies;
buffer_info->mapped_as_page = false;
buffer_info->dma = dma_map_single(&pdev->dev,
                                  skb->data + offset,
                                  size, DMA_TO_DEVICE);
Back in the main transmit function:
if (count) {
    netdev_sent_queue(netdev, skb->len);
    skb_tx_timestamp(skb);

    e1000_tx_queue(adapter, tx_ring, tx_flags, count);
    /* Make sure there is space in the ring for the next send. */
    e1000_maybe_stop_tx(netdev, tx_ring, MAX_SKB_FRAGS + 2);
}
e1000_tx_queue is then called to hand the data to the hardware:
static void e1000_tx_queue(struct e1000_adapter *adapter,
                           struct e1000_tx_ring *tx_ring, int tx_flags,
                           int count)
{
    struct e1000_hw *hw = &adapter->hw;
    struct e1000_tx_desc *tx_desc = NULL;
    struct e1000_buffer *buffer_info;
    u32 txd_upper = 0, txd_lower = E1000_TXD_CMD_IFCS;
    unsigned int i;
    ...
    i = tx_ring->next_to_use;

    while (count--) {
        buffer_info = &tx_ring->buffer_info[i];
        tx_desc = E1000_TX_DESC(*tx_ring, i);
        tx_desc->buffer_addr = cpu_to_le64(buffer_info->dma);
        tx_desc->lower.data =
            cpu_to_le32(txd_lower | buffer_info->length);
        tx_desc->upper.data = cpu_to_le32(txd_upper);
        if (unlikely(++i == tx_ring->count))
            i = 0;
    }

    tx_desc->lower.data |= cpu_to_le32(adapter->txd_cmd);

    /* txd_cmd re-enables FCS, so we'll re-disable it here as desired. */
    if (unlikely(tx_flags & E1000_TX_FLAGS_NO_FCS))
        tx_desc->lower.data &= ~(cpu_to_le32(E1000_TXD_CMD_IFCS));

    /* Force memory writes to complete before letting h/w
     * know there are new descriptors to fetch. (Only
     * applicable for weak-ordered memory model archs,
     * such as IA-64).
     */
    wmb();

    tx_ring->next_to_use = i;
    writel(i, hw->hw_addr + tx_ring->tdt);
    /* we need this if more than one processor can write to our tail
     * at a time, it syncronizes IO on IA64/Altix systems
     */
    mmiowb();
}
Notice how the address established earlier by dma_map_single is written into the transmit descriptor:

tx_desc->buffer_addr = cpu_to_le64(buffer_info->dma);

so transmission is driven entirely by these descriptors. Then comes the register write:

writel(i, hw->hw_addr + tx_ring->tdt);

after which the NIC fetches the tx descriptors on its own and puts the data on the wire.
To summarize the flow:

1. The network stack calls the NIC driver's start_xmit() hook; in e1000 this is e1000_xmit_frame.
2. e1000_xmit_frame in turn calls e1000_tx_queue(adapter, tx_ring, tx_flags, count), where the queue in question is the transmit descriptor queue.
3. After its checks, e1000_tx_queue finally calls writel(i, hw->hw_addr + tx_ring->tdt). The tdt in tx_ring->tdt stands for transmit descriptor tail: per the NIC's datasheet, writing the descriptor tail register makes the hardware fetch the new descriptors and send the packets.

A descriptor's essential contents are an address pointer and a length: the physical start address of the packet to send and its size. With those, the hardware can read the packet via DMA and transmit it. Other NICs use essentially the same descriptor scheme.
The flow is now clear, but two questions remain:

1. Where is tx_ring initialized?
2. How exactly does the NIC learn the mapped DMA addresses it uses to send the data?

The tx ring is set up at e1000_open time, which calls:
/**
 * e1000_setup_all_tx_resources - wrapper to allocate Tx resources
 *                                (Descriptors) for all queues
 * @adapter: board private structure
 *
 * Return 0 on success, negative on failure
 **/
int e1000_setup_all_tx_resources(struct e1000_adapter *adapter)
{
    int i, err = 0;

    for (i = 0; i < adapter->num_tx_queues; i++) {
        err = e1000_setup_tx_resources(adapter, &adapter->tx_ring[i]);
        if (err) {
            e_err(probe, "Allocation for Tx Queue %u failed\n", i);
            for (i-- ; i >= 0; i--)
                e1000_free_tx_resources(adapter,
                                        &adapter->tx_ring[i]);
            break;
        }
    }

    return err;
}
/**
 * e1000_setup_tx_resources - allocate Tx resources (Descriptors)
 * @adapter: board private structure
 * @txdr:    tx descriptor ring (for a specific queue) to setup
 *
 * Return 0 on success, negative on failure
 **/
static int e1000_setup_tx_resources(struct e1000_adapter *adapter,
                                    struct e1000_tx_ring *txdr)
{
    struct pci_dev *pdev = adapter->pdev;
    int size;

    size = sizeof(struct e1000_buffer) * txdr->count;
    txdr->buffer_info = vzalloc(size);
    if (!txdr->buffer_info) {
        e_err(probe, "Unable to allocate memory for the Tx descriptor "
              "ring\n");
        return -ENOMEM;
    }

    /* round up to nearest 4K */
    txdr->size = txdr->count * sizeof(struct e1000_tx_desc);
    txdr->size = ALIGN(txdr->size, 4096);

    txdr->desc = dma_alloc_coherent(&pdev->dev, txdr->size, &txdr->dma,
                                    GFP_KERNEL);
    if (!txdr->desc) {
setup_tx_desc_die:
        vfree(txdr->buffer_info);
        e_err(probe, "Unable to allocate memory for the Tx descriptor "
              "ring\n");
        return -ENOMEM;
    }

    /* Fix for errata 23, can't cross 64kB boundary */
    if (!e1000_check_64k_bound(adapter, txdr->desc, txdr->size)) {
        void *olddesc = txdr->desc;
        dma_addr_t olddma = txdr->dma;
        e_err(tx_err, "txdr align check failed: %u bytes at %p\n",
              txdr->size, txdr->desc);
        /* Try again, without freeing the previous */
        txdr->desc = dma_alloc_coherent(&pdev->dev, txdr->size,
                                        &txdr->dma, GFP_KERNEL);
        /* Failed allocation, critical failure */
        if (!txdr->desc) {
            dma_free_coherent(&pdev->dev, txdr->size, olddesc,
                              olddma);
            goto setup_tx_desc_die;
        }

        if (!e1000_check_64k_bound(adapter, txdr->desc, txdr->size)) {
            /* give up */
            dma_free_coherent(&pdev->dev, txdr->size, txdr->desc,
                              txdr->dma);
            dma_free_coherent(&pdev->dev, txdr->size, olddesc,
                              olddma);
            e_err(probe, "Unable to allocate aligned memory "
                  "for the transmit descriptor ring\n");
            vfree(txdr->buffer_info);
            return -ENOMEM;
        } else {
            /* Free old allocation, new allocation was successful */
            dma_free_coherent(&pdev->dev, txdr->size, olddesc,
                              olddma);
        }
    }
    memset(txdr->desc, 0, txdr->size);

    txdr->next_to_use = 0;
    txdr->next_to_clean = 0;

    return 0;
}
Note that it creates a coherent DMA mapping for the descriptor ring itself:
txdr->desc = dma_alloc_coherent(&pdev->dev, txdr->size, &txdr->dma,
                                GFP_KERNEL);
desc is a structure pointer; its layout mirrors the descriptor format the NIC hardware expects (see e1000_hw.h):
/* Transmit Descriptor */
struct e1000_tx_desc {
    __le64 buffer_addr;     /* Address of the descriptor's data buffer */
    union {
        __le32 data;
        struct {
            __le16 length;  /* Data buffer length */
            u8 cso;         /* Checksum offset */
            u8 cmd;         /* Descriptor control */
        } flags;
    } lower;
    union {
        __le32 data;
        struct {
            u8 status;      /* Descriptor status */
            u8 css;         /* Checksum start */
            __le16 special;
        } fields;
    } upper;
};
Let's sort out the relationships:
skb->data               --- ring->buffer_info->dma    (streaming mapping: packet data)
ring->dma               --- ring->desc                 (coherent mapping: descriptor ring)
ring->desc->buffer_addr --- ring->buffer_info->dma     (descriptor points at packet data)
So how does the NIC get associated with these DMA addresses?
/**
 * e1000_configure_tx - Configure 8254x Transmit Unit after Reset
 * @adapter: board private structure
 *
 * Configure the Tx unit of the MAC after a reset.
 **/
static void e1000_configure_tx(struct e1000_adapter *adapter)
{
    u64 tdba;
    struct e1000_hw *hw = &adapter->hw;
    u32 tdlen, tctl, tipg;
    u32 ipgr1, ipgr2;

    /* Setup the HW Tx Head and Tail descriptor pointers */
    switch (adapter->num_tx_queues) {
    case 1:
    default:
        tdba = adapter->tx_ring[0].dma;
        tdlen = adapter->tx_ring[0].count *
            sizeof(struct e1000_tx_desc);
        ew32(TDLEN, tdlen);
        ew32(TDBAH, (tdba >> 32));
        ew32(TDBAL, (tdba & 0x00000000ffffffffULL));
        ew32(TDT, 0);
        ew32(TDH, 0);
        adapter->tx_ring[0].tdh = ((hw->mac_type >= e1000_82543) ?
                                   E1000_TDH : E1000_82542_TDH);
        adapter->tx_ring[0].tdt = ((hw->mac_type >= e1000_82543) ?
                                   E1000_TDT : E1000_82542_TDT);
        break;
    }
Clearly it writes the DMA (bus) address of the descriptor ring into the NIC's registers: TDBAL/TDBAH for the base address, TDLEN for the ring length, and TDH/TDT for the head and tail pointers. So DMA also depends on support from the NIC hardware itself.
The e1000 driver is certainly complex, but it puts both kinds of mapping to use: a coherent mapping for the descriptor ring, and streaming mappings for the packet buffers.
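One step the walkthrough skipped: after the NIC signals completion, the driver must undo the streaming mappings. In e1000 this happens in e1000_unmap_and_free_tx_resource() on the TX-clean path; in spirit (a simplified sketch of that idea, not the verbatim driver code) it does:

/* Sketch of the TX-completion side: release the streaming mapping
 * created by dma_map_single()/skb_frag_dma_map(), then free the skb.
 */
static void unmap_tx_buffer(struct e1000_adapter *adapter,
                            struct e1000_buffer *buffer_info)
{
    if (buffer_info->dma) {
        if (buffer_info->mapped_as_page)
            dma_unmap_page(&adapter->pdev->dev, buffer_info->dma,
                           buffer_info->length, DMA_TO_DEVICE);
        else
            dma_unmap_single(&adapter->pdev->dev, buffer_info->dma,
                             buffer_info->length, DMA_TO_DEVICE);
        buffer_info->dma = 0;
    }
    if (buffer_info->skb) {
        dev_kfree_skb_any(buffer_info->skb);
        buffer_info->skb = NULL;
    }
}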