perf mem -D report - linux

I was using perf mem -t load record "commands" to profile system memory access latency. After, I run perf mem -D report and I got the following results:
[root#mdtm-server wenji]# perf mem -D report
# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL
2054 2054 0xffffffff811186bf 0x016ffffe8fbffc804b0 49 0x68100842 /lib/modules/3.12.23/build/vmlinux:perf_event_aux_ctx
2054 2054 0xffffffff81321d6e 0xffff880c7fc87d44 7 0x68100142 /lib/modules/3.12.23/build/vmlinux:ghes_copy_tofrom_phys
What does "ADDR", "DSRC", "SYMBOL" mean?

IP - PC of the load/store instruction;
SYMBOL - name of function, containing this instruction (IP);
ADDR - virtual memory address of data, requested by load/store (if there was no --phys-data option)
DSRC - "Decoded Source".
DSRC - There was recommendation to check "SDM Vol 3b Table 18-41 (Layout of Data Linear Address Information in PEBS Record)" in some mailing lists.
There is also DSRC coding code in kernel (dse from hw - PEBS; u64 return is dsrc):
http://lxr.free-electrons.com/source/arch/x86/kernel/cpu/perf_event_intel_ds.c?v=4.3#L28
28 union intel_x86_pebs_dse {
29 u64 val;
30 struct {
31 unsigned int ld_dse:4;
32 unsigned int ld_stlb_miss:1;
33 unsigned int ld_locked:1;
34 unsigned int ld_reserved:26;
35 };
36 struct {
37 unsigned int st_l1d_hit:1;
38 unsigned int st_reserved1:3;
39 unsigned int st_stlb_miss:1;
40 unsigned int st_locked:1;
41 unsigned int st_reserved2:26;
42 };
43 };
http://lxr.free-electrons.com/source/arch/x86/kernel/cpu/perf_event_intel_ds.c?v=4.3#L46
46 /*
47 * Map PEBS Load Latency Data Source encodings to generic
48 * memory data source information
49 */
50 #define P(a, b) PERF_MEM_S(a, b)
51 #define OP_LH (P(OP, LOAD) | P(LVL, HIT))
52 #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
53
54 static const u64 pebs_data_source[] = {
55 P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
56 OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
57 OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
58 OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
59 OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
60 OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
61 OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
62 OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
63 OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
64 OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
65 OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
66 OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
67 OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
68 OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
69 OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */
70 OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
71 };
72
73 static u64 precise_store_data(u64 status)
74 {
75 union intel_x86_pebs_dse dse;
76 u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
77
78 dse.val = status;
79
80 /*
81 * bit 4: TLB access
82 * 1 = stored missed 2nd level TLB
83 *
84 * so it either hit the walker or the OS
85 * otherwise hit 2nd level TLB
86 */
87 if (dse.st_stlb_miss)
88 val |= P(TLB, MISS);
89 else
90 val |= P(TLB, HIT);
91
92 /*
93 * bit 0: hit L1 data cache
94 * if not set, then all we know is that
95 * it missed L1D
96 */
97 if (dse.st_l1d_hit)
98 val |= P(LVL, HIT);
99 else
100 val |= P(LVL, MISS);
101
102 /*
103 * bit 5: Locked prefix
104 */
105 if (dse.st_locked)
106 val |= P(LOCK, LOCKED);
107
108 return val;
109 }
dsrc sounds like several combinations of PERF_MEM_* macro in bitfields:
http://lxr.free-electrons.com/source/include/uapi/linux/perf_event.h?v=4.3#L878
878 union perf_mem_data_src {
879 __u64 val;
880 struct {
881 __u64 mem_op:5, /* type of opcode */
882 mem_lvl:14, /* memory hierarchy level */
883 mem_snoop:5, /* snoop mode */
884 mem_lock:2, /* lock instr */
885 mem_dtlb:7, /* tlb access */
886 mem_rsvd:31;
887 };
888 };
890 /* type of opcode (load/store/prefetch,code) */
891 #define PERF_MEM_OP_NA 0x01 /* not available */
892 #define PERF_MEM_OP_LOAD 0x02 /* load instruction */
893 #define PERF_MEM_OP_STORE 0x04 /* store instruction */
894 #define PERF_MEM_OP_PFETCH 0x08 /* prefetch */
895 #define PERF_MEM_OP_EXEC 0x10 /* code (execution) */
896 #define PERF_MEM_OP_SHIFT 0
897
898 /* memory hierarchy (memory level, hit or miss) */
899 #define PERF_MEM_LVL_NA 0x01 /* not available */
900 #define PERF_MEM_LVL_HIT 0x02 /* hit level */
901 #define PERF_MEM_LVL_MISS 0x04 /* miss level */
902 #define PERF_MEM_LVL_L1 0x08 /* L1 */
903 #define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */
904 #define PERF_MEM_LVL_L2 0x20 /* L2 */
905 #define PERF_MEM_LVL_L3 0x40 /* L3 */
906 #define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */
907 #define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */
908 #define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */
909 #define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */
910 #define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */
911 #define PERF_MEM_LVL_IO 0x1000 /* I/O memory */
912 #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */
913 #define PERF_MEM_LVL_SHIFT 5
914
915 /* snoop mode */
916 #define PERF_MEM_SNOOP_NA 0x01 /* not available */
917 #define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */
918 #define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */
919 #define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */
920 #define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */
921 #define PERF_MEM_SNOOP_SHIFT 19
922
923 /* locked instruction */
924 #define PERF_MEM_LOCK_NA 0x01 /* not available */
925 #define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */
926 #define PERF_MEM_LOCK_SHIFT 24
927
928 /* TLB access */
929 #define PERF_MEM_TLB_NA 0x01 /* not available */
930 #define PERF_MEM_TLB_HIT 0x02 /* hit level */
931 #define PERF_MEM_TLB_MISS 0x04 /* miss level */
932 #define PERF_MEM_TLB_L1 0x08 /* L1 */
933 #define PERF_MEM_TLB_L2 0x10 /* L2 */
934 #define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/
935 #define PERF_MEM_TLB_OS 0x40 /* OS fault handler */
936 #define PERF_MEM_TLB_SHIFT 26
937
938 #define PERF_MEM_S(a, s) \
939 (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
940

Related

Why my spi test C code get this result?

At the bottom is the spi test code (spitest.c) I used, and when running it on my linux kit, I got this result:
root#abcd-kit:/system # ./spitest
open device: /dev/spidev0.0
set spi mode: 0
set bits per word: 8
set max speed: 2000000 Hz (2 MHz)
the received data is below:
00 00 00 00 30 30
30 0A 00 00 00 00
00 00 00 00 2F 73
the received data is below:
00 00 00 00 30 30
30 0A 00 00 00 00
00 00 00 00 2F 73
...
dmesg output:
<7>[ 1254.714088] usif-spi e1100000.usif1: Pushing msg a8085ed0
<6>[ 1254.714367] SPI XFER :ae81c700 , Length : 18
<6>[ 1254.714404] TX Buf :a6207000 , TX DMA : (null)
<6>[ 1254.714425] RX Buf :92bf5000 , RX DMA : (null)
<6>[ 1254.714445] CS change:0, bits/w :8, delay : 0 us, speed : 2000000 Hz
<7>[ 1254.714471] TX--->:31 a5 bb 00 00 bb fc 76 80 84 1e 00 5c 29 7d 77
<7>[ 1254.714491] TX--->:44 b9
<7>[ 1254.714511] RX--->:00 00 00 00 30 30 30 0a 00 00 00 00 00 00 00 00
<7>[ 1254.714534] RX--->:2f 73
<7>[ 1254.714558] usif-spi e1100000.usif1: Msg a8085ed0 completed with status 0
<7>[ 1255.725936] usif-spi e1100000.usif1: Pushing msg a8085ed0
<6>[ 1255.726472] SPI XFER :ae81cc40 , Length : 18
<6>[ 1255.726604] TX Buf :a6207000 , TX DMA : (null)
<6>[ 1255.726656] RX Buf :92bf5000 , RX DMA : (null)
<6>[ 1255.726706] CS change:0, bits/w :8, delay : 0 us, speed : 2000000 Hz
<7>[ 1255.726773] TX--->:31 a5 bb 00 00 bb fc 76 94 29 7d 77 5c 29 7d 77
<7>[ 1255.726829] TX--->:44 b9
<7>[ 1255.726875] RX--->:00 00 00 00 30 30 30 0a 00 00 00 00 00 00 00 00
<7>[ 1255.726925] RX--->:2f 73
And the biggest problem is that I cannot get correct result from miso pin (read is wrong, can do write correctly). Whatever I do, e.g. connect miso to ground or 1.8V, it always give this kind of result. The fisrt 5 data are always zero (I think it is because tx buffer has size of 5 and it is half duplex), and then followed random data, even that I used memset() to set rx buffer data to be zero before each spi transfer. And if I stop the program and run it again, the data changed but they are still random.
How could I read correct data from miso pin?
Thanks!
spitest.c
/*
* SPI testing utility (using spidev driver)
*
* Copyright (c) 2007 MontaVista Software, Inc.
* Copyright (c) 2007 Anton Vorontsov <avorontsov#ru.mvista.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License.
*
* Cross-compile with cross-gcc -I/path/to/cross-kernel/include
*/
#include <stdint.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "spidev.h"
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
static void pabort(const char *s)
{
perror(s);
abort();
}
static const char *device = "/dev/spidev0.0";
static uint8_t mode;
static uint8_t bits = 8;
static uint32_t speed = 2000000;
static uint16_t delay;
#define LENGTH 18
static void transfer(int fd)
{
int ret, i;
uint8_t tx[5] = {0x31, 0xa5, 0xbb};
uint8_t rx[LENGTH] = {0, };
struct spi_ioc_transfer tr = {
.tx_buf = (unsigned long)tx,
.rx_buf = (unsigned long)rx,
.len = LENGTH,
.delay_usecs = delay,
.speed_hz = speed,
.bits_per_word = bits, //important, bits = 8 means byte transfer is possible
};
memset(rx, 0, LENGTH);
ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr);
if (ret < 1)
pabort("can't send spi message\n");
printf("the received data is below:\n");
for (ret = 0; ret < LENGTH; ret++) { //print the received data, by Tom Xue
if (!(ret % 6))
puts("");
printf("%.2X ", rx[ret]);
}
puts("");
}
int main(int argc, char *argv[])
{
int ret = 0;
int fd;
unsigned char rd_buf[32];
fd = open(device, O_RDWR);
if (fd < 0)
pabort("can't open device\n");
/*
* * spi mode
* */
ret = ioctl(fd, SPI_IOC_WR_MODE, &mode);
if (ret == -1)
pabort("can't set spi mode\n");
ret = ioctl(fd, SPI_IOC_RD_MODE, &mode);
if (ret == -1)
pabort("can't get spi mode\n");
/*
* * bits per word
* */
ret = ioctl(fd, SPI_IOC_WR_BITS_PER_WORD, &bits);
if (ret == -1)
pabort("can't set bits per word\n");
ret = ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits);
if (ret == -1)
pabort("can't get bits per word\n");
/*
* * max speed hz
* */
ret = ioctl(fd, SPI_IOC_WR_MAX_SPEED_HZ, &speed);
if (ret == -1)
pabort("can't set max speed hz\n");
ret = ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed);
if (ret == -1)
pabort("can't get max speed hz\n");
printf("open device: %s\n", device);
printf("set spi mode: %d\n", mode);
printf("set bits per word: %d\n", bits);
printf("set max speed: %d Hz (%d MHz)\n", speed, speed/1000000);
while(1){
transfer(fd);
//read(fd, rd_buf, 4);
//printf("rd_buf = %s, %d, %d, %d, %d\n", rd_buf, rd_buf[0], rd_buf[1], rd_buf[2], rd_buf[3]);
//memset(rd_buf, 0, 10);
sleep(1);
}
close(fd);
return ret;
}
More:
My CPU is Intel Sofia-3gr, I guess its spec is not publicly released. I see the Tx data from my oscilloscope, and confirmed that TX is right.
I can also printk the pinmux/pinctrl setting (use ioremap and ioread32), it is also right. I say it right also because that I can see how to set it as SPI from those dts reference files, I just follow them.
Key:
I just find that the SPI TX interrupt is pending each time a spi transfer starts, but no SPI RX interrupt pending. Hence the spi driver code will not read the RX data at all. As the reason, I don't know.

Deadlock in MCS lock implementation

Hardware:
Darwin Kernel Version 13.2.0: Thu Apr 17 23:03:13 PDT 2014; root:xnu-2422.100.13~1/RELEASE_X86_64 x86_64
atomics.hpp
1 #ifndef ATOMIC_UTILS_H
2 #define ATOMIC_UTILS_H
3
4 #include
5
6 #define BARRIER() __asm__ volatile ( "": : :"memory" )
7
8 #define CPU_RELAX() __asm__ volatile( "pause\n\t": : :"memory" )
9
10 #define STORE_FENCE() __asm__ volatile("mfence" ::: "memory");
11
12 class AtomicUtils
13 {
14 public:
15
16 /**
17 * check if the value at addr is equal to oldval, if so replace it with newva l
18 * and return the oldval
19 */
20 inline static size_t compareAndExchange( volatile size_t* addr, size_t oldval , size_t newval )
21 {
22 size_t ret;
23 __asm__ volatile( "lock cmpxchgq %2, %1\n\t"
24 :"=a"(ret), "+m"(*addr)
25 : "r"(newval), "0"(oldval)
26 : "memory" );
27 return ret;
28 }
29
30 /**
31 * Atomically stores x into addr and returns the previous
32 * stored in addr
33 */
34 inline static size_t loadAndStore( size_t x, volatile size_t* addr )
36 {
37 size_t ret;
38 __asm__ volatile( "lock xchgq %1, %0\n\t"
39 : "+m"(*addr), "=r"(ret)
40 : "1"(x) );
41 return ret;
42 }
43
44 };
45
46 #endif
mcs.hpp
1 #ifndef MCS_LOCK_H
2 #define MCS_LOCK_H
3
4 #include "atomics.hpp"
5 #include
6
7 class MCSLock
8 {
9 struct mcs_lock_t
10 {
11 mcs_lock_t():next(0), locked(false){}
12 struct mcs_lock_t* next;
13 bool locked;
14 };
15
16 public:
17 typedef struct mcs_lock_t mcs_lock;
18
19 private:
20 mcs_lock** tail;
21 static boost::thread_specific_ptr tls_node;
22
23 public:
24 MCSLock( mcs_lock** lock_tail ):tail( lock_tail )
25 {
26 if( tls_node.get() == 0 )
27 tls_node.reset( new mcs_lock() );
28 }
29
30 void lock()
31 {
32 mcs_lock* thread_node = tls_node.get();
33 thread_node->next = 0;
34 thread_node->locked = true;
35
36 volatile mcs_lock* pred = reinterpret_cast(
37 AtomicUtils::loadAndStore(
38 reinterpret_cast( thread_node ),
39 reinterpret_cast( tail )
40 )
41 );
42 if( pred != 0 )
43 {
44 pred->next = *tail;
45
46 STORE_FENCE();
47 //BARRIER(); // Required to prevent re ordering between prev->next = tail and thread_node->locked. ( WR harzard )
48
49 // Spin on a local variable. Someone unlock me plz !!
50 while( thread_node->locked )
51 CPU_RELAX();
52
53 }
54 }
55
56 void unlock()
57 {
58 mcs_lock* thread_node = tls_node.get();
59 if( thread_node->next == 0 )
60 {
61 // If false, then we a new thread has request for lock. Now release t he lock for the new thread
62 if(
63 AtomicUtils::compareAndExchange(
64 reinterpret_cast( tail ),
65 reinterpret_cast( thread_node ),
66 0
67 ) == reinterpret_cast( thread_node ) 68 )
69 {
70 return;
71 }
72
73 while( thread_node->next == 0 )
74 CPU_RELAX();
75 }
76
77 thread_node->next->locked = false;
78 }
79 };
80
81 boost::thread_specific_ptr MCSLock::tls_node;
82 #endif
mcs_test.cpp
1 #include "mcs.hpp"
2 #include <iostream>
3 #include <pthread.h>
4 #include <vector>
5 #define NUM_THREADS 16
6 #define NUM_ITERATIONS 100
7
8 std::vector<int> elements;
9 MCSLock::mcs_lock *tail = 0;
10
11 void* thread_run( void* data )
12 {
13 MCSLock lock( &tail );
14 for( int i = 0; i < NUM_ITERATIONS; ++i )
15 {
16 lock.lock();
17 elements.push_back( i );
18 lock.unlock();
19 }
20
21 return 0;
22 }
23
24 int main()
25 {
26 pthread_t threads[ NUM_THREADS ];
27 elements.reserve( NUM_THREADS * NUM_ITERATIONS );
28
29 {
30 for( int i = 0; i < NUM_THREADS; ++i )
31 pthread_create( &threads[i], NULL, thread_run, NULL );
32
33 for( int i = 0; i < NUM_THREADS; ++i )
34 pthread_join( threads[i], NULL );
35
36 std::cout <<"\nExiting main thread: " << std::endl;
37 }
38 }
The above code is compiled using clang
Problem:
I see that 1 or 2 threads are stuck in lock() in line 50. Except the main threads, the threads which are stuck in lock() there are no other threads alive. This means that when the other threads invoke unlock() they somehow don't set the locked = false for other variables and exit.
Any pointers on debugging this please ?
Stuck on this for many hours and no clues.
Doesn't clang have builtins for these inline-asm blocks (like gcc's __sync_val_compare_and_swap)? Why re-invent the wheel?
Second, I'd really think about adding the memory clobber to loadAndStore. You need to make sure that any writes the compiler is holding in registers gets flushed to memory before doing the xchgq. Similarly it will prevent gcc from optimizing memory reads to before the xchgq. Either would be bad.
Third, I'd examine the asm output for your while loops (thread_node->locked and thread_node->next). Since these variables are not volatile, gcc may optimize this to only perform the read once.
These may not solve your problem, but that's where I'd start.

PCI driver to fetch MAC address

I was trying to write a pci driver which can display the MAC address of my Ethernet card.
Running a Ubuntu on VM and my Ethernet card is Intel one as follows
00:08.0 Ethernet controller: Intel Corporation 82540EM Gigabit Ethernet Controller (rev 02)
I was able to get the data sheet of the same from Intel website and as per data sheet it says IO address are mapped to Bar 2 (Refer to pg 87) and MAC can be read using RAL/RAH register which are at offset RAL (05400h + 8*n; R/W) and RAH (05404h + 8n; R/W)
2 18h IO Register Base Address (bits 31:2) 0b mem
Based on this information, i wrote a small PCI driver but i always get the MAC as fff and when i debugged further, i see io_base address is always zero.
Below is the code
1 /*
2 Program to find a device on the PCI sub-system
3 */
4 #define VENDOR_ID 0x8086
5 #define DEVICE_ID 0x100e
6
7 #include <linux/kernel.h>
8 #include <linux/module.h>
9 #include <linux/stddef.h>
10 #include <linux/pci.h>
11 #include <linux/init.h>
12 #include <linux/cdev.h>
13 #include <linux/device.h>
14 #include <asm/io.h>
15
16 #define LOG(string...) printk(KERN_INFO string)
17
18 #define CDEV_MAJOR 227
19 #define CDEV_MINOR 0
20
21
22 MODULE_LICENSE("GPL");
23
24 struct pci_dev *pci_dev;
25 unsigned long mmio_addr;
26 unsigned long reg_len;
27 unsigned long *base_addr;
28
29 int device_probe(struct pci_dev *dev, const struct pci_device_id *id);
30 void device_remove(struct pci_dev *dev);
31
32 struct pci_device_id pci_device_id_DevicePCI[] =
33 {
34 {VENDOR_ID, DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
35 };
36
37 struct pci_driver pci_driver_DevicePCI =
38 {
39 name: "MyPCIDevice",
40 id_table: pci_device_id_DevicePCI,
41 probe: device_probe,
42 remove: device_remove
43 };
44
45
46 int init_module(void)
47 {
48 //struct pci_dev *pdev = NULL;
49 int ret = 0;
50
51 pci_register_driver(&pci_driver_DevicePCI);
52
53 return ret;
54 }
55
56 void cleanup_module(void)
57 {
58 pci_unregister_driver(&pci_driver_DevicePCI);
59
60 }
61
62 #define REGISTER_OFFSET 0x05400
64 int device_probe(struct pci_dev *dev, const struct pci_device_id *id)
65 {
66 int ret;
67 int bar = 2; // Bar to be reserved
68 unsigned long io_base = 0;
69 unsigned long mem_len = 0;
70 unsigned int register_data = 0;
71
72 LOG("Device probed");
73
74 /* Reserve the access to PCI device */
75 ret = pci_request_region(dev, bar, "my_pci");
76 if (ret) {
77 printk(KERN_ERR "request region failed :%d\n", ret);
78 return ret;
79 }
80
81 ret = pci_enable_device(dev);
82 if (ret < 0 ) LOG("Failed while enabling ... ");
83
84 io_base = pci_resource_start(dev, bar);
85 mem_len = pci_resource_len(dev, bar);
86
87 request_region(io_base, mem_len, "my_pci");
88 register_data = inw(io_base + REGISTER_OFFSET);
89 printk(KERN_INFO "IO base = %lx", io_base);
90 printk(KERN_INFO "MAC = %x", register_data);
91
92 return ret;
93 }
94
95 void device_remove(struct pci_dev *dev)
96 {
97 pci_release_regions(dev);
98 pci_disable_device(dev);
99 }
100
lspci -x output of my card
00:08.0 Ethernet controller: Intel Corporation 82540EM Gigabit Ethernet Controller (rev 02)
00: 86 80 0e 10 07 00 30 02 02 00 00 02 00 40 00 00
10: 00 00 82 f0 00 00 00 00 41 d2 00 00 00 00 00 00
20: 00 00 00 00 00 00 00 00 00 00 00 00 86 80 1e 00
30: 00 00 00 00 dc 00 00 00 00 00 00 00 09 01 ff 00
Can any one let me know what am i doing wrong?
I've modified your code and commented on changes. I have removed all of your existing comments to avoid confusion, and have only modified your probe function.
/* We need a place to store a logical address for unmapping later */
static void* logical_address;
int device_probe(struct pci_dev *dev, const struct pci_device_id *id)
{
int ret;
int bar_mask; /* BAR mask (this variable) and the integer BAR */
int requested_bar = 2; /* (this variable) are not the same thing, so give them */
/* separate variables */
resource_size_t io_base = 0; /* use kernel macros instead of built-in datatypes */
resource_size_t mem_len = 0;
unsigned int register_data = 0;
LOG("Device probed");
/* add this call to get the correct BAR mask */
bar_mask = pci_select_bars(dev, 0);
/* switched order - enable device before requesting memory */
ret = pci_enable_device(dev);
if (ret < 0 ) LOG("Failed while enabling ... ");
/* for this call, we want to pass the BAR mask, NOT the integer bar we want */
ret = pci_request_region(dev, bar_mask, "my_pci");
if (ret) {
printk(KERN_ERR "request region failed :%d\n", ret);
return ret;
}
/* it is in THESE calls that we request a specific BAR */
io_base = pci_resource_start(dev, requested_bar);
mem_len = pci_resource_len(dev, requested_bar);
/* you don't need to request anything again, so get rid of this line: */
/* request_region(io_base, mem_len, "my_pci"); */
/* you're missing an important step: we need to translate the IO address
* to a kernel logical address that we can actually use. Add a call to
* ioremap()
*/
logical_address = ioremap(io_base, mem_len);
/* we need to use the logical address returned by ioremap(), not the physical
* address returned by resource_start
*/
register_data = inw(logical_address + REGISTER_OFFSET);
printk(KERN_INFO "IO base = %lx", io_base);
printk(KERN_INFO "MAC = %x", register_data);
return ret;
}
You will need to add a corresponding call to iounmap() in your device_remove() routine. Take a look at the Intel E100E driver source code for some good examples.

linux kernel ip_options_build() function

Below is the ip_options_build() in linux kernel 3.4, line 51 and 52:
51 if (opt->srr)
52 memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
I understand that the two lines say, if source routing option is present, copy the destination address to the end of the option, that suggests that iph[opt->srr+1] is the length of the source routing option, but I don't get it why?
31/*
32 * Write options to IP header, record destination address to
33 * source route option, address of outgoing interface
34 * (we should already know it, so that this function is allowed be
35 * called only after routing decision) and timestamp,
36 * if we originate this datagram.
37 *
38 * daddr is real destination address, next hop is recorded in IP header.
39 * saddr is address of outgoing interface.
40 */
41
42void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
43 __be32 daddr, struct rtable *rt, int is_frag)
44{
45 unsigned char *iph = skb_network_header(skb);
46
47 memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
48 memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
49 opt = &(IPCB(skb)->opt);
50
51 if (opt->srr)
52 memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
53
54 if (!is_frag) {
55 if (opt->rr_needaddr)
56 ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
57 if (opt->ts_needaddr)
58 ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
59 if (opt->ts_needtime) {
60 struct timespec tv;
61 __be32 midtime;
62 getnstimeofday(&tv);
63 midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC
+ tv.tv_nsec / NSEC_PER_MSEC);
64 memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
65 }
66 return;
67 }
68 if (opt->rr) {
69 memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
70 opt->rr = 0;
71 opt->rr_needaddr = 0;
72 }
73 if (opt->ts) {
74 memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
75 opt->ts = 0;
76 opt->ts_needaddr = opt->ts_needtime = 0;
77 }
78}
If I remember correctly, iph + opt->srr is basically the address of the first byte of the srr option. The format of the option itself is as follows:
TYPE (1 byte) | LENGTH (1 byte) | OFFSET (1 byte) | ... and then some addresses 4 bytes each
The LENGTH "field" specifies the length in bytes of the entire option, so that's why iph[opt->srr+1] is the length of the option.

Why doesn't fill the PTE entry in page fault for vmalloc on ARM Linux Kernel?

When page fault happens on VMALLOC_START~VMALLOC_END, why do_translation_fault does not fill Page table entry and just fill PG, PUD and PMD?
Corresponding source code #do_translation_fault in arch/arm/mm/fault.c:
414 static int __kprobes
415 do_translation_fault(unsigned long addr, unsigned int fsr,
416 struct pt_regs *regs)
417 {
418 unsigned int index;
419 pgd_t *pgd, *pgd_k;
420 pud_t *pud, *pud_k;
421 pmd_t *pmd, *pmd_k;
422
423 if (addr < TASK_SIZE)
424 return do_page_fault(addr, fsr, regs);
425
426 if (user_mode(regs))
427 goto bad_area;
428
429 index = pgd_index(addr);
430
431 /*
432 * FIXME: CP15 C1 is write only on ARMv3 architectures.
433 */
434 pgd = cpu_get_pgd() + index;
435 pgd_k = init_mm.pgd + index;
436
437 if (pgd_none(*pgd_k))
438 goto bad_area;
439 if (!pgd_present(*pgd))
440 set_pgd(pgd, *pgd_k);
441
442 pud = pud_offset(pgd, addr);
443 pud_k = pud_offset(pgd_k, addr);
444
445 if (pud_none(*pud_k))
446 goto bad_area;
447 if (!pud_present(*pud))
448 set_pud(pud, *pud_k);
449
450 pmd = pmd_offset(pud, addr);
451 pmd_k = pmd_offset(pud_k, addr);
452
453 #ifdef CONFIG_ARM_LPAE
454 /*
455 * Only one hardware entry per PMD with LPAE.
456 */
457 index = 0;
458 #else
459 /*
460 * On ARM one Linux PGD entry contains two hardware entries (see page
461 * tables layout in pgtable.h). We normally guarantee that we always
462 * fill both L1 entries. But create_mapping() doesn't follow the rule.
463 * It can create inidividual L1 entries, so here we have to call
464 * pmd_none() check for the entry really corresponded to address, not
465 * for the first of pair.
466 */
467 index = (addr >> SECTION_SHIFT) & 1;
468 #endif
469 if (pmd_none(pmd_k[index]))
470 goto bad_area;
471
472 copy_pmd(pmd, pmd_k);
473 return 0;
474
475 bad_area:
476 do_bad_area(addr, fsr, regs);
477 return 0;
478 }
This range is reserved for kernel memory, allocated using vmalloc.
Kernel memory is normally accessed when IRQs (soft or hard) are disabled, and page faults can't be handled.
The vmalloc function takes care to create the mapping in advance, so there will be no fault on access.
If there was a fault, it's because the access is to memory which wasn't allocated (or was freed), so it can't be handled.

Resources