Tuesday, March 12, 2013

pm-cpuidle: High level code flow from bootup

When the kernel boots up, it starts as a single thread with PID = 0, named "swapper". This thread eventually calls cpu_idle() (an architecture-specific function) and becomes the idle task.

init/main.c
start_kernel(void)
{
  ...
  rest_init();

  This is the last function called from start_kernel(), after all the inits are done.
}


rest_init()
{
  ...
  /* We need to spawn init first so that it obtains pid 1, however
   * the init task will end up wanting to create kthreads, which, if
   * we schedule it before we create kthreadd, will OOPS.
   */
  kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);    PID = 1
  ...
  pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);   PID = 2
  ...
  /*
   * The boot idle thread must execute schedule()
   * at least once to get things moving:
   */
  init_idle_bootup_task(current);
  schedule_preempt_disabled();
  /* Call into cpu_idle with preempt disabled */
  cpu_idle();     PID = 0

}

arch/arm/kernel/process.c
cpu_idle()
 
{
  ...
  /* endless idle loop with no priority at all */
  while (1) {

    ...
    while (!need_resched()) {
      ...
      if (cpuidle_idle_call())     
        pm_idle()
      If CONFIG_CPU_IDLE is enabled, cpuidle_idle_call() returns 0 in the
      non-error case, so pm_idle() is not called. Selecting and entering
      the sleep state is done inside cpuidle_idle_call() itself.
      ...
    }
  }
}

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
pm_idle()
 cpu_do_idle() => cpu_v7_do_idle

arch/arm/mm/proc-v7.S
 ENTRY(cpu_v7_do_idle)
    dsb           @ WFI may enter a low-power mode
    wfi
    mov     pc, lr
 ENDPROC(cpu_v7_do_idle)
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx


drivers/cpuidle/cpuidle.c
int cpuidle_idle_call(void)
{
  ...
  /* ask the governor for the next state */
 
  next_state = cpuidle_curr_governor->select(drv, dev);
  The current cpuidle governor identifies the appropriate idle state.
  The governor uses the platform-supplied state data (exit latency,
  target residency, etc.) to select the appropriate sleep state.
  ...
  entered_state = cpuidle_enter_ops(dev, drv, next_state);

  Here we enter the idle state suggested by the current cpuidle
  governor. Platform-specific code comes into the picture here to
  actually enter the chosen idle state.
  ...
}
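
To make the platform-specific side a bit more concrete, here is a minimal sketch (not from the kernel tree; my_idle_driver and my_enter_wfi are made-up names) of how a platform might register the states the governor selects from, following the v3.4-era cpuidle API:

#include <linux/cpuidle.h>
#include <linux/module.h>
#include <asm/proc-fns.h>

/* Hypothetical .enter callback: execute WFI and report which state was entered */
static int my_enter_wfi(struct cpuidle_device *dev,
                        struct cpuidle_driver *drv, int index)
{
        cpu_do_idle();                  /* WFI */
        return index;
}

static struct cpuidle_driver my_idle_driver = {
        .name           = "my_idle",
        .owner          = THIS_MODULE,
        .state_count    = 1,
        .states[0] = {
                .name             = "WFI",
                .desc             = "ARM wait-for-interrupt",
                .exit_latency     = 1,          /* microseconds */
                .target_residency = 1,          /* microseconds */
                .flags            = CPUIDLE_FLAG_TIME_VALID,
                .enter            = my_enter_wfi,
        },
};

The driver is registered with cpuidle_register_driver(&my_idle_driver) plus a per-CPU cpuidle_register_device(). The governor's select() then picks among my_idle_driver.states[], and cpuidle_enter_ops() ends up calling the chosen state's .enter callback.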



[Based on Linux kernel v3.4]

Saturday, March 9, 2013

ARM Linux: Switching page tables during context switch


Every process (except kernel threads) in Linux has a page table of its own.

For 32-bit ARM without LPAE, 2-level page table is used.
For 32-bit ARM with LPAE, 3-level page table is used.
 
In arch/arm/mm/proc-v7.S:
#ifdef CONFIG_ARM_LPAE
#include "proc-v7-3level.S"
#else
#include "proc-v7-2level.S"
#endif


 

Let's look at the case of 32-bit ARM without LPAE, which uses a 2-level page table (a sketch of how a virtual address is split under this scheme follows the list below):
  • Level-1: pgd: page global directory
  • Level-2: pte: page table entry
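
As a quick illustration, here is a small user-space sketch of how a 32-bit virtual address is split under the classic ARM short-descriptor (non-LPAE) format with 4 KB small pages. Note that this is the raw hardware view; ARM Linux itself folds two hardware level-1 entries into each pgd entry (PGDIR_SHIFT is 21), so the kernel's own indices differ slightly:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t va = 0x80123ABC;               /* example virtual address */
        uint32_t l1_index = va >> 20;           /* bits [31:20]: 4096 level-1 entries */
        uint32_t l2_index = (va >> 12) & 0xFF;  /* bits [19:12]: 256 level-2 entries  */
        uint32_t offset   = va & 0xFFF;         /* bits [11:0] : offset in a 4 KB page */

        printf("L1 index %u, L2 index %u, page offset 0x%x\n",
               l1_index, l2_index, offset);
        return 0;
}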

The pgd base for each process is reachable from its task_struct, as shown below:

struct task_struct {   :include/linux/sched.h
    ...
    struct mm_struct *mm
    ...
    }

struct mm_struct{      :include/linux/mm_types.h
    ...
    pgd_t * pgd;
    ...
    }

TTBR: Translation Table Base Register - check the ARM Architecture Reference Manual for more details about this register
LPAE: Large Physical Address Extension

Whenever a context switch happens in Linux, the pgd base of the next process has to be loaded into TTBR (note that this is not done when switching to kernel threads, since kernel threads don't have an mm_struct of their own). Here is the code flow for how it is done.

schedule() :kernel/sched/core.c
     __schedule()

__schedule() :kernel/sched/core.c
    context_switch()

context_switch() :kernel/sched/core.c
    switch_mm()


switch_mm() :arch/arm/include/asm/mmu_context.h
    cpu_switch_mm(next->pgd, next);

    #define cpu_switch_mm(pgd,mm) cpu_do_switch_mm(virt_to_phys(pgd),mm)
    #define cpu_do_switch_mm processor.switch_mm

    processor.switch_mm is cpu_v7_switch_mm() in case of ARMv7

        struct processor { :arch/arm/include/asm/proc-fns.h
        ...
        /*
         * Set the page table
         */
        void (*switch_mm)(unsigned long pgd_phys, struct mm_struct *mm);
        ...
        }
 
    Check arch/arm/include/asm/glue-proc.h
       #define CPU_NAME cpu_v7
       #define cpu_do_switch_mm   __glue(CPU_NAME,_switch_mm)
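
The name pasting itself is plain token concatenation; roughly (the real definitions live in arch/arm/include/asm/glue.h):

#define ____glue(name, fn)      name##fn
#define __glue(name, fn)        ____glue(name, fn)

/* With CPU_NAME defined as cpu_v7:
 *   cpu_do_switch_mm  ->  __glue(cpu_v7, _switch_mm)  ->  cpu_v7_switch_mm
 */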


Here is some info from the ARM spec for accessing the TTBR register:
CP15 register: c2
Register name: TTBR0, Translation Table Base Register 0
Affected operation: MCR p15, 0, <Rd>, c2, c0, 0
 
cpu_v7_switch_mm(mm->pgd, mm) :arch/arm/mm/proc-v7-2level.S
    ...
    mcr     p15, 0, r0, c2, c0, 0           @ set TTB 0

This is the instruction which sets TTBR0 to point to the page table of the next process. 
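
As for the kernel-thread case mentioned above, here is a simplified (paraphrased) excerpt of context_switch() from kernel/sched/core.c showing why no page-table switch happens for kernel threads: they have no mm of their own and simply borrow the previous task's active_mm.

static inline void
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
        struct mm_struct *mm = next->mm;
        struct mm_struct *oldmm = prev->active_mm;

        if (!mm) {
                /* kernel thread: borrow the previous mm, no TTBR0 update */
                next->active_mm = oldmm;
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next);
        } else {
                /* user task: switch_mm() ends up in cpu_v7_switch_mm() */
                switch_mm(oldmm, mm, next);
        }

        /* ... switch_to() follows, switching registers and the kernel stack ... */
}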

[Based on Linux kernel v3.4]

Thursday, February 28, 2013

Atomic function implementation in ARM architecture


LDREX and STREX are the instructions on which the atomic functions are built. Let's look briefly at those two instructions.



LDREX
 

LDREX loads data from memory.
  • If the physical address has the Shared TLB attribute, LDREX tags the physical address as exclusive access for the current processor, and clears any exclusive access tag for this processor for any other physical address.
  • Otherwise, it tags the fact that the executing processor has an outstanding tagged physical address.


STREX

STREX performs a conditional store to memory. The conditions are as follows:
  • If the physical address does not have the Shared TLB attribute, and the executing processor has an outstanding tagged physical address, the store takes place and the tag is cleared.
  • If the physical address does not have the Shared TLB attribute, and the executing processor does not have an outstanding tagged physical address, the store does not take place.
  • If the physical address has the Shared TLB attribute, and the physical address is tagged as exclusive access for the executing processor, the store takes place and the tag is cleared.
  • If the physical address has the Shared TLB attribute, and the physical address is not tagged as exclusive access for the executing processor, the store does not take place.


LDREX{size}{cond} Rd, {Rd2,} [Rn {, #offset}]
STREX{size}{cond} Rd, Rm, {Rm2,} [Rn {, #offset}]

Rd
is the destination register. After the instruction, this contains:
  • for LDREX, the data loaded from memory
  • for STREX, either:
    • 0: if the instruction succeeds
    • 1: if the instruction is locked out.


For more information about the above mentioned ARM instructions, refer to the following link:
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204g/Cihbghef.html


The following discussion assumes that you are familiar with inline assembly in gcc. If you are not, check out the following links.
http://www.ethernut.de/en/documents/arm-inline-asm.html
http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html


There are various atomic functions like:
  • atomic_add( )
  • atomic_sub( )
  • atomic_cmpxchg( )
  • atomic_clear_mask( )
  • atomic_inc()
  • atomic_dec()
  • atomic_inc_and_test()
  • atomic_dec_and_test() etc


All of them are based on the same principle of using LDREX and STREX. Let's look at the implementation of atomic_add(); you can check the rest of the functions in arch/arm/include/asm/atomic.h, which are very similar in implementation.

/*
 * ARMv6 UP and SMP safe atomic ops.  We use load exclusive and
 * store exclusive to ensure that these are atomic.  We may loop
 * to ensure that the update happens.
 */
static inline void atomic_add(int i, atomic_t *v)
{
        unsigned long tmp;
        int result;

        __asm__ __volatile__("@ atomic_add\n"
"1:     ldrex   %0, [%3]\n"
"       add     %0, %0, %4\n"
"       strex   %1, %0, [%3]\n"
"       teq     %1, #0\n"
"       bne     1b"
        : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
        : "r" (&v->counter), "Ir" (i)
        : "cc");
}



Here is the typedef for atomic variable:

typedef struct {
        int counter;
} atomic_t;


Let's go through instruction by instruction.


Scenario 1: atomic_add() succeeds in one go, as no one else is accessing the variable which has to be incremented atomically.

ldrex   %0, [%3]
The address of the variable that has to be incremented is in %3. The value at this address is copied into %0 i.e. 'result'.
LDREX tags the physical address as exclusive access for the current processor, and clears any exclusive access tag for this processor for any other physical address.

add     %0, %0, %4
Adds i to the variable which we want to increment.

strex   %1, %0, [%3]
Stores the incremented value in the memory and writes 0 into %1 i.e. 'tmp'.

teq     %1, #0
Sets the Z bit in CPSR as 'tmp' is 0.

bne     1b
As the Z bit in CPSR is set, this doesn't branch to the label '1:' and proceeds further.



Scenario 2: atomic_add() does not succeed on the first pass, but may succeed on one of the later passes. This happens when someone else is concurrently updating this variable.

First pass:

ldrex   %0, [%3]
The address of the variable that has to be incremented is in %3. The value at this address is copied into %0 i.e. 'result'.
As in Scenario 1, LDREX tags the physical address as exclusive access for the current processor. However, before our STREX executes, another processor updates the same variable, which clears our exclusive access tag.

add     %0, %0, %4
Adds i to the variable which we want to increment.

strex   %1, %0, [%3]
Fails to store the incremented value in memory, as the physical address is no longer tagged as exclusive access for the executing processor, and writes 1 into %1 i.e. 'tmp'.

teq     %1, #0
Clears the Z bit in CPSR as 'tmp' is 1.

bne     1b
As the Z bit in CPSR is cleared, this branches to the label '1:' and repeats the above instructions.


2nd or later pass:

If no one else is updating the variable that we want to increment atomically, then the code flow is the same as Scenario 1; otherwise the code flow is the same as the first pass of Scenario 2.
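
To round this off, here is a minimal usage sketch (hypothetical module code; the names are made up). Callers never see the LDREX/STREX loop; they simply get an increment that is safe against concurrent updates from other CPUs or from interrupt context.

#include <linux/atomic.h>

static atomic_t packets_seen = ATOMIC_INIT(0);

static void on_packet(void)
{
        atomic_add(1, &packets_seen);           /* equivalent to atomic_inc() */
}

static int read_packets(void)
{
        return atomic_read(&packets_seen);      /* plain read of v->counter */
}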





Monday, February 25, 2013

Mutex implementation in ARM architecture


LDREX and STREX are the instructions on which the mutex implementation is built. Let's look briefly at those two instructions.


 
LDREX


LDREX loads data from memory.
  • If the physical address has the Shared TLB attribute, LDREX tags the physical address as exclusive access for the current processor, and clears any exclusive access tag for this processor for any other physical address.
  • Otherwise, it tags the fact that the executing processor has an outstanding tagged physical address.


STREX

STREX performs a conditional store to memory. The conditions are as follows:
  • If the physical address does not have the Shared TLB attribute, and the executing processor has an outstanding tagged physical address, the store takes place and the tag is cleared.
  • If the physical address does not have the Shared TLB attribute, and the executing processor does not have an outstanding tagged physical address, the store does not take place.
  • If the physical address has the Shared TLB attribute, and the physical address is tagged as exclusive access for the executing processor, the store takes place and the tag is cleared.
  • If the physical address has the Shared TLB attribute, and the physical address is not tagged as exclusive access for the executing processor, the store does not take place.


LDREX{size}{cond} Rd, {Rd2,} [Rn {, #offset}]
STREX{size}{cond} Rd, Rm, {Rm2,} [Rn {, #offset}]

Rd
is the destination register. After the instruction, this contains:
  • for LDREX, the data loaded from memory
  • for STREX, either:
    • 0: if the instruction succeeds
    • 1: if the instruction is locked out.


For more information about the above mentioned ARM instructions, refer to the following link:
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204g/Cihbghef.html


The following discussion assumes that you are familiar with inline assembly in gcc. If you are not, check out the following links.
http://www.ethernut.de/en/documents/arm-inline-asm.html
http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html



mutex_lock( ) in kernel/mutex.c

void __sched mutex_lock(struct mutex *lock)
{
        might_sleep();
        /*
         * The locking fastpath is the 1->0 transition from
         * 'unlocked' into 'locked' state.
         */
        __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
        mutex_set_owner(lock);
}

__mutex_lock_slowpath( ) is the function which places the current task on the wait queue; it is called only if the mutex could not be acquired on the fast path.

Let's look at __mutex_fastpath_lock( ) which is architecture specific (arch/arm/include/asm/mutex.h).

static inline void
__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
        int __ex_flag, __res;

        __asm__ (

                "ldrex  %0, [%2]        \n\t"
                "sub    %0, %0, #1      \n\t"
                "strex  %1, %0, [%2]    "

                : "=&r" (__res), "=&r" (__ex_flag)
                : "r" (&(count)->counter)
                : "cc","memory" );

        __res |= __ex_flag;
        if (unlikely(__res != 0))
                fail_fn(count);
}


In the above code, %1 i.e. __ex_flag contains the result of the STREX. If the STREX is locked out, __ex_flag is 1. %0 i.e. __res holds the decremented count; if the mutex was already locked, the count was not 1 before the decrement, so __res is also non-zero. After __res |= __ex_flag, __res is therefore non-zero if either the STREX failed or the mutex was contended. In that case (i.e. we did not acquire the lock) we call fail_fn(), which is __mutex_lock_slowpath( ); it puts the current task on the wait queue and goes to sleep.

If the STREX succeeds (__ex_flag is 0) and the mutex was free (the count was 1, so the decremented value is 0), __res stays 0 after __res |= __ex_flag. In that case (i.e. we acquired the lock) we don't call fail_fn() and the current task continues.



mutex_unlock( )  in kernel/mutex.c

void __sched mutex_unlock(struct mutex *lock)
{
        /*
         * The unlocking fastpath is the 0->1 transition from 'locked'
         * into 'unlocked' state:
         */
...
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
}

__mutex_unlock_slowpath( ) is the function which wakes up the tasks that are waiting on this mutex.

Let's look at __mutex_fastpath_unlock( ) which is architecture specific (arch/arm/include/asm/mutex.h).

static inline void
__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
        int __ex_flag, __res, __orig;

        __asm__ (

                "ldrex  %0, [%3]        \n\t"
                "add    %1, %0, #1      \n\t"
                "strex  %2, %1, [%3]    "

                : "=&r" (__orig), "=&r" (__res), "=&r" (__ex_flag)
                : "r" (&(count)->counter)
                : "cc","memory" );

        __orig |= __ex_flag;
        if (unlikely(__orig != 0))
                fail_fn(count);
}

Along similar lines to __mutex_fastpath_lock( ), fail_fn(), i.e. __mutex_unlock_slowpath( ), is called if the STREX fails or if the original count was non-zero, which indicates that there may be tasks waiting on this mutex that need to be woken up.
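
For completeness, here is a minimal usage sketch (hypothetical driver code; the names are made up). The fastpath shown above is what runs when the mutex is uncontended; the slowpath only runs when the lock is already held or has waiters.

#include <linux/mutex.h>

static DEFINE_MUTEX(my_lock);
static int shared_state;

static void update_state(int v)
{
        mutex_lock(&my_lock);           /* may sleep, so not usable in IRQ context */
        shared_state = v;
        mutex_unlock(&my_lock);
}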

Check the following files for more information:

  • arch/arm/include/asm/mutex.h
  • kernel/mutex.c


Spinlock implementation in ARM architecture


SEV and WFE are the main instructions used for implementing spinlocks on the ARM architecture. Let's look briefly at those two instructions before looking into the actual spinlock implementation.

SEV

SEV causes an event to be signaled to all cores within a multiprocessor system. If SEV is implemented, WFE must also be implemented.

WFE

If the Event Register is not set, WFE suspends execution until one of the following events occurs:
  • an IRQ interrupt, unless masked by the CPSR I-bit
  • an FIQ interrupt, unless masked by the CPSR F-bit
  • an Imprecise Data abort, unless masked by the CPSR A-bit
  • a Debug Entry request, if Debug is enabled
  • an Event signaled by another processor using the SEV instruction.

In case of spin_lock_irq( )/spin_lock_irqsave( ),
  • as IRQs are disabled, the only way to resume after the WFE instruction has executed is for some other core to execute the SEV instruction.

In case of spin_lock( ),
  • If IRQs were enabled before we called spin_lock( ) and we executed WFE and execution got suspended:
    • Scenario 1: An interrupt occurred and was handled; we resume, but as the lock is still not released, we loop back and execute WFE again.
    • Scenario 2: Some other core released some other lock (and executed SEV), but did not release our lock; we resume, and as our lock is still not released, we loop back and execute WFE again.
    • Scenario 3: Some other core released this lock (and executed SEV); we resume, and as the lock is now free, we acquire it.
  • If IRQs are disabled before calling spin_lock( ), then the situation is the same as spin_lock_irqsave( ).

In case of spin_unlock( ),
  • lock is released and SEV instruction is executed.
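
Before looking at the low-level implementation, here is a minimal usage sketch (hypothetical driver code; the names are made up). spin_lock_irqsave() is the variant to use when the same data is also touched from an interrupt handler.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);
static int shared_counter;

static void bump_counter(void)
{
        unsigned long flags;

        spin_lock_irqsave(&my_lock, flags);       /* disables local IRQs, then spins */
        shared_counter++;
        spin_unlock_irqrestore(&my_lock, flags);  /* unlock (SEV), then restore IRQs */
}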


Check out the following code snippets for actual implementation:


static inline void arch_spin_lock(arch_spinlock_t *lock)
{
        unsigned long tmp;

        __asm__ __volatile__(
"1:     ldrex   %0, [%1]\n"
"       teq     %0, #0\n"
        WFE("ne")
"       strexeq %0, %2, [%1]\n"
"       teqeq   %0, #0\n"
"       bne     1b"
        : "=&r" (tmp)
        : "r" (&lock->lock), "r" (1)
        : "cc");

        smp_mb();
}



static inline void arch_spin_unlock(arch_spinlock_t *lock)
{
        smp_mb();

        __asm__ __volatile__(
"       str     %1, [%0]\n"
        :
        : "r" (&lock->lock), "r" (0)
        : "cc");

        dsb_sev();
}


static inline void dsb_sev(void)
{
#if __LINUX_ARM_ARCH__ >= 7
        __asm__ __volatile__ (
                "dsb\n"
                SEV
        );
#else
        __asm__ __volatile__ (
                "mcr p15, 0, %0, c7, c10, 4\n"
                SEV
                : : "r" (0)
        );
#endif
}


For more information, check arch/arm/include/asm/spinlock.h in Linux kernel source code. The above code snippet is from 3.4 kernel.

Sunday, February 17, 2013

C Program: Checking whether a linked list is circular

#include <stdbool.h>

typedef struct Node {
    struct Node* next;
    /* data fields omitted */
} Node;

bool IsCircular(Node* head)
{
    bool circleFound = false;
    bool tailFound = false;

    if (head && head->next)
    {
        Node* slower = head;
        Node* faster = head;

        do
        {
            slower = slower->next;
            faster = faster->next->next;
            /* Tail found if the faster pointer fell off the end of the list */
            tailFound = !(faster && faster->next);
            if (!tailFound)
            {
                circleFound = (slower == faster || slower == faster->next);
            }
        }
        while (!tailFound && !circleFound);
    }

    return circleFound;
}
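
A quick test sketch (it assumes a Node definition like the one above): build a three-node list, check it, then create a cycle and check again.

#include <stdio.h>

int main(void)
{
    Node a, b, c;
    a.next = &b;
    b.next = &c;
    c.next = NULL;

    printf("no cycle: %d\n", IsCircular(&a));   /* expect 0 */

    c.next = &a;                                /* create a cycle */
    printf("cycle:    %d\n", IsCircular(&a));   /* expect 1 */

    return 0;
}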