Merge remote-tracking branch 'origin/davek' into aks_clus_int
author    Akshay Giridhar <akshay87@vt.edu>
          Fri, 8 Aug 2014 22:03:42 +0000 (18:03 -0400)
committer Akshay Giridhar <akshay87@vt.edu>
          Fri, 8 Aug 2014 22:03:42 +0000 (18:03 -0400)
Conflicts:
arch/x86/mm/fault.c
include/linux/process_server.h
kernel/process_server.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c

arch/x86/mm/fault.c
include/linux/process_server.h
include/popcorn/init.h
kernel/process_server.c
mm/mprotect.c
mm/mremap.c

Simple merge
  #define PROCESS_SERVER_CLONE_FAIL 1
  
  //configuration
 -//#define SUPPORT_FOR_CLUSTERING
 -#undef SUPPORT_FOR_CLUSTERING
 +#define SUPPORT_FOR_CLUSTERING
 +//#undef SUPPORT_FOR_CLUSTERING
  
+ //#define PROCESS_SERVER_USE_KMOD
+ #undef PROCESS_SERVER_USE_KMOD
+ #define PROCESS_SERVER_ENFORCE_VMA_MOD_ATOMICITY
+ //#undef PROCESS_SERVER_ENFORCE_VMA_MOD_ATOMICITY
+ //#define PROCESS_SERVER_USE_DISTRIBUTED_MM_LOCK
+ //#undef PROCESS_SERVER_USE_DISTRIBUTED_MM_LOCK
+ #define PROCESS_SERVER_USE_HEAVY_LOCK
+ //#undef PROCESS_SERVER_USE_HEAVY_LOCK
+ #if defined(PROCESS_SERVER_USE_DISTRIBUTED_MM_LOCK) && defined(PROCESS_SERVER_USE_HEAVY_LOCK)
+ #error cannot have both PROCESS_SERVER_USE_DISTRIBUTED_MM_LOCK and PROCESS_SERVER_USE_HEAVY_LOCK
+ #endif
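
These switches pick one global locking strategy at build time, and the #error above rejects builds that enable both distributed locks. A minimal sketch of how a caller might dispatch on them; ps_acquire_vma_lock is a hypothetical helper, not part of this header, and the acquire functions it calls are declared further down:

/* Hypothetical helper (assumption, not in the original header): acquire
 * whichever global lock this build enabled before modifying a VMA. */
static inline int ps_acquire_vma_lock(void)
{
#if defined(PROCESS_SERVER_USE_HEAVY_LOCK)
    return process_server_acquire_heavy_lock();
#elif defined(PROCESS_SERVER_USE_DISTRIBUTED_MM_LOCK)
    return process_server_acquire_distributed_mm_lock();
#else
    return 0; /* rely on the per-page locks taken at fault time */
#endif
}
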
  /*
   * Migration hook.
   */
@@@ -52,4 -76,13 +76,12 @@@ int process_server_dup_task(struct task
  unsigned long process_server_do_mmap_pgoff(struct file *file, unsigned long addr,
                                             unsigned long len, unsigned long prot,
                                             unsigned long flags, unsigned long pgoff);
 -
+ int process_server_acquire_page_lock(unsigned long address);
+ int process_server_acquire_page_lock_range(unsigned long address, size_t sz);
+ int process_server_acquire_heavy_lock(void);
+ int process_server_acquire_distributed_mm_lock(void);
+ void process_server_release_page_lock(unsigned long address);
+ void process_server_release_page_lock_range(unsigned long address, size_t sz);
+ void process_server_release_heavy_lock(void);
+ void process_server_release_distributed_mm_lock(void);
  #endif // _PROCESS_SERVER_H
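
The acquire/release prototypes above come in symmetric pairs. A short sketch of the intended pairing around a single remote page fault; the fault-handler context is assumed, not shown in this diff:

/* Sketch (assumed caller context): hold the per-page Lamport lock across a
 * distributed page-table update for one faulting address. */
process_server_acquire_page_lock(address);
/* ... fetch the remote mapping and install the pte for address ... */
process_server_release_page_lock(address);
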
index 07b2602,0000000..dc69802
mode 100644,000000..100644
--- /dev/null
@@@ -1,28 -1,0 +1,28 @@@
- extern struct list_head pfn_list_head;
 +#ifndef __POPCORN_INIT_H
 +#define __POPCORN_INIT_H
 +/*
 + * Boot parameters for allocating Kernel ID
 + *
 + * (C) Akshay Ravichandran <akshay87@vt.edu> 2012
 + */
 +
 +
 +extern unsigned int Kernel_Id;
 +extern unsigned long *token_bucket;
 +extern unsigned long long bucket_phys_addr;
 +extern unsigned long kernel_start_addr;
 +
 +extern void popcorn_init(void);
 +
 +extern int _init_RemoteCPUMask(void);
 +
 +
 +extern struct list_head rlist_head;
 +
- extern int _init_RemotePFN(void);
++//extern struct list_head pfn_list_head;
 +
++//extern int _init_RemotePFN(void);
 +
 +
 +#endif /* __POPCORN_INIT_H */
 +
  #include <linux/pcn_kmsg.h> // Messaging
  #include <linux/pcn_perf.h> // performance measurement
  #include <linux/string.h>
 +
 +#include <linux/popcorn_cpuinfo.h>
+ #include <linux/unistd.h>
+ #include <linux/tsacct_kern.h>
+ #include <linux/popcorn.h>
+ #include <linux/syscalls.h>
+ #include <linux/kernel.h>
+ #include <linux/proc_fs.h>
  
  #include <asm/pgtable.h>
  #include <asm/atomic.h>
@@@ -256,7 -256,14 +268,15 @@@ static void perf_init(void) 
  #define PERF_MEASURE_STOP(x, y, z)
  #endif
  
 +static DECLARE_WAIT_QUEUE_HEAD(countq);
+ /**
+  * Enums
+  */
+ typedef enum _lamport_barrier_state {
+     LAMPORT_ENTRY_OWNED,
+     LAMPORT_ENTRY_OFF_LIMITS,
+     LAMPORT_ENTRY_CONTENDED
+ } lamport_barrier_state_t;
  
  
  /**
@@@ -346,12 -353,8 +366,14 @@@ typedef struct _clone_data 
      unsigned short thread_ds;
      unsigned short thread_fsindex;
      unsigned short thread_gsindex;
 +#ifdef FPU_
 +    unsigned int  task_flags; //FPU, but should be extended t
 +    unsigned char task_fpu_counter;
 +    unsigned char thread_has_fpu;
 +    union thread_xstate fpu_state; //FPU migration
 +#endif
+     unsigned long def_flags;
+     unsigned int personality;
      int tgroup_home_cpu;
      int tgroup_home_id;
      int t_home_cpu;
@@@ -478,12 -508,8 +534,14 @@@ typedef struct _clone_request 
      unsigned short thread_ds;
      unsigned short thread_fsindex;
      unsigned short thread_gsindex;
 +#ifdef FPU_   
 +    unsigned int  task_flags; //FPU, but should be extended t
 +    unsigned char task_fpu_counter; 
 +    unsigned char thread_has_fpu;   
 +    union thread_xstate fpu_state; //FPU migration support
 +#endif
+     unsigned long def_flags;
+     unsigned int personality;
      int tgroup_home_cpu;
      int tgroup_home_id;
      int t_home_cpu;
@@@ -757,15 -790,128 +829,134 @@@ typedef struct _back_migration 
      unsigned short thread_es;
      unsigned short thread_ds;
      unsigned short thread_fsindex;
 -    unsigned short thread_gsindex;
 +    unsigned short thread_gsindex; 
 +#ifdef FPU_   
 +    unsigned int  task_flags; //FPU, but should be extended t
 +    unsigned char task_fpu_counter; 
 +    unsigned char thread_has_fpu;   
 +    union thread_xstate fpu_state; //FPU migration support
 +#endif
  } back_migration_t;
  
+ /**
+  *
+  */
+ struct _lamport_barrier_request{
+     struct pcn_kmsg_hdr header;
+     int tgroup_home_cpu;            // 4
+     int tgroup_home_id;             // 4
+     unsigned long address;          // 8
+     int is_heavy;                   // 4
+     unsigned long long timestamp;   // 16
+                                     // ---
+                                     // 36 -> 28 bytes of padding needed
+     char pad[28];
+ } __attribute__((packed)) __attribute__((aligned(64)));
+ typedef struct _lamport_barrier_request lamport_barrier_request_t;
+ /**
+  *
+  */
+ struct _lamport_barrier_request_range {
+     struct pcn_kmsg_hdr header;
+     int tgroup_home_cpu;            // 4
+     int tgroup_home_id;             // 4
+     unsigned long address;          // 8
+     int is_heavy;                    // 4
+     size_t sz;                      // 4
+     unsigned long long timestamp;   // 16
+                                     // ---
+                                     // 40 -> 20 bytes of padding needed
+     char pad[20];
+ } __attribute__((packed)) __attribute__((aligned(64)));
+ typedef struct _lamport_barrier_request_range lamport_barrier_request_range_t;
+ /**
+  *
+  */
+ struct _lamport_barrier_response {
+     struct pcn_kmsg_hdr header;
+     int tgroup_home_cpu;            // 4
+     int tgroup_home_id;             // 4
+     unsigned long address;          // 8
+     int is_heavy;         // 4
+     unsigned long long timestamp;   // 16
+                                     // ---
+                                     // 36 -> 24 bytes of padding needed
+     char pad[24];
+ } __attribute__((packed)) __attribute__((aligned(64)));
+ typedef struct _lamport_barrier_response lamport_barrier_response_t;
+ /**
+  *
+  */
+ struct _lamport_barrier_response_range {
+     struct pcn_kmsg_hdr header;
+     int tgroup_home_cpu;            // 4
+     int tgroup_home_id;             // 4
+     unsigned long address;          // 8
+     int is_heavy;                   // 4
+     size_t sz;                      // 4
+     unsigned long long timestamp;   // 16
+                                     // ---
+                                     // 40 -> 20 bytes of padding needed
+     char pad[20];
+ } __attribute__((packed)) __attribute__((aligned(64)));
+ typedef struct _lamport_barrier_response_range lamport_barrier_response_range_t;
+ /**
+  *
+  */
+ struct _lamport_barrier_release {
+     struct pcn_kmsg_hdr header;
+     int tgroup_home_cpu;            // 4
+     int tgroup_home_id;             // 4
+     unsigned long address;          // 8
+     int is_heavy;                    // 4
+     unsigned long long timestamp;   // 16
+                                     // ---
+                                     // 36 -> 24 bytes of padding needed
+     char pad[24];
+ } __attribute__((packed)) __attribute__((aligned(64)));
+ typedef struct _lamport_barrier_release lamport_barrier_release_t;
+ /**
+  *
+  */
+ struct _lamport_barrier_release_range {
+     struct pcn_kmsg_hdr header;
+     int tgroup_home_cpu;            // 4
+     int tgroup_home_id;             // 4
+     unsigned long address;          // 8
+     int is_heavy;                    // 4
+     size_t sz;                      // 4
+     unsigned long long timestamp;   // 16
+                                     // ---
+                                     // 40 -> 20 bytes of padding needed
+     char pad[20];
+ } __attribute__((packed)) __attribute__((aligned(64)));
+ typedef struct _lamport_barrier_release_range lamport_barrier_release_range_t;
+ /**
+  *
+  */
+ struct _get_counter_phys_request {
+     struct pcn_kmsg_hdr header;
+     char pad[60];
+ } __attribute__((packed)) __attribute__((aligned(64)));
+ typedef struct _get_counter_phys_request get_counter_phys_request_t;
+ /**
+  *
+  */
+ struct _get_counter_phys_response {
+     struct pcn_kmsg_hdr header;
+     unsigned long resp;
+     char pad[58];
+ } __attribute__((packed)) __attribute__((aligned(64)));
+ typedef struct _get_counter_phys_response get_counter_phys_response_t;
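
Every message above is declared packed and 64-byte aligned, with pad[] sized so the whole struct fills one cache line. A compile-time sanity check one could add; this is a sketch using the kernel's BUILD_BUG_ON, not present in the original:

/* Sketch: __attribute__((aligned(64))) rounds sizeof() up to a multiple of
 * 64, so each fixed-size message should occupy exactly one cache line. */
static inline void lamport_msg_size_checks(void)
{
    BUILD_BUG_ON(sizeof(lamport_barrier_request_t)  != 64);
    BUILD_BUG_ON(sizeof(lamport_barrier_response_t) != 64);
    BUILD_BUG_ON(sizeof(lamport_barrier_release_t)  != 64);
    BUILD_BUG_ON(sizeof(get_counter_phys_request_t) != 64);
}
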
  /**
   *
   */
@@@ -795,14 -941,15 +986,21 @@@ typedef struct 
      unsigned short thread_ds;
      unsigned short thread_fsindex;
      unsigned short thread_gsindex;
 -} exit_work_t;
  
 +#ifdef FPU_   
 +    unsigned int  task_flags; //FPU, but should be extended t
 +    unsigned char task_fpu_counter; 
 +    unsigned char thread_has_fpu;   
 +    union thread_xstate fpu_state; //FPU migration support
 +#endif
 +} exit_work_t;
+ /**
+  *
+  */
+ typedef struct {
+     struct work_struct work;
+     clone_data_t* data;
+ } import_task_work_t;
  
  /**
   *
@@@ -937,15 -1085,88 +1136,92 @@@ typedef struct 
      unsigned short thread_ds;
      unsigned short thread_fsindex;
      unsigned short thread_gsindex;
- #ifdef FPU_   
-     unsigned int  task_flags; //FPU, but should be extended t
-     unsigned char task_fpu_counter; 
-     unsigned char thread_has_fpu;   
-     union thread_xstate fpu_state; //FPU migration support
- #endif
++#ifdef FPU_
++    unsigned int  task_flags; //FPU, but should be extended t
++    unsigned char task_fpu_counter;
++    unsigned char thread_has_fpu;
++    union thread_xstate fpu_state; // FPU migration support
++#endif
  } back_migration_work_t;
  
 -
+ /**
+  *
+  */
+ typedef struct {
+     struct work_struct work;
+     int tgroup_home_cpu;
+     int tgroup_home_id;
+     int from_cpu;
+     unsigned long address;
+     int is_heavy;
+     unsigned long long timestamp;
+ } lamport_barrier_request_work_t;
+ /**
+  *
+  */
+ typedef struct {
+     struct work_struct work;
+     int tgroup_home_cpu;
+     int tgroup_home_id;
+     int from_cpu;
+     unsigned long address;
+     int is_heavy;
+     unsigned long long timestamp;
+ } lamport_barrier_response_work_t;
+ /**
+  * 
+  */
+ typedef struct {
+     struct work_struct work;
+     int tgroup_home_cpu;
+     int tgroup_home_id;
+     int from_cpu;
+     unsigned long address;
+     int is_heavy;
+     unsigned long long timestamp;
+ } lamport_barrier_release_work_t;
+ /**
+  *
+  */
+ typedef struct {
+     struct work_struct work;
+     int tgroup_home_cpu;
+     int tgroup_home_id;
+     int from_cpu;
+     unsigned long address;
+     int is_heavy;
+     size_t sz;
+     unsigned long long timestamp;
+ } lamport_barrier_request_range_work_t;
+ /**
+  *
+  */
+ typedef struct {
+     struct work_struct work;
+     int tgroup_home_cpu;
+     int tgroup_home_id;
+     int from_cpu;
+     unsigned long address;
+     int is_heavy;
+     size_t sz;
+     unsigned long long timestamp;
+ } lamport_barrier_response_range_work_t;
+ /**
+  * 
+  */
+ typedef struct {
+     struct work_struct work;
+     int tgroup_home_cpu;
+     int tgroup_home_id;
+     int from_cpu;
+     unsigned long address;
+     int is_heavy;
+     size_t sz;
+     unsigned long long timestamp;
+ } lamport_barrier_release_range_work_t;
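
Each work struct above mirrors one of the lamport_barrier_* messages. The usual pattern, sketched here with an assumed handler name, work function, and workqueue (none of which appear in this diff), is for the kmsg callback to copy the message into a work item and defer the barrier logic out of interrupt context:

/* Sketch (handler, work function, and queue names are assumptions). */
static int handle_lamport_barrier_request(struct pcn_kmsg_message* inc_msg)
{
    lamport_barrier_request_t* msg = (lamport_barrier_request_t*)inc_msg;
    lamport_barrier_request_work_t* work = kmalloc(sizeof(*work), GFP_ATOMIC);
    if (work) {
        INIT_WORK((struct work_struct*)work, process_lamport_barrier_request);
        work->tgroup_home_cpu = msg->tgroup_home_cpu;
        work->tgroup_home_id  = msg->tgroup_home_id;
        work->from_cpu        = msg->header.from_cpu;
        work->address         = msg->address;
        work->is_heavy        = msg->is_heavy;
        work->timestamp       = msg->timestamp;
        queue_work(clone_wq, (struct work_struct*)work);
    }
    pcn_kmsg_free_msg(inc_msg);
    return 0;
}
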
  
  /**
   * Prototypes
@@@ -1122,19 -1448,22 +1503,22 @@@ static int is_mapped(struct mm_struct* 
  
      pgd = pgd_offset(mm, vaddr);                                                   
      if (pgd && !pgd_none(*pgd) && likely(!pgd_bad(*pgd)) && pgd_present(*pgd)) {
-       pud = pud_offset(pgd,vaddr);                                               
-       if (pud && !pud_none(*pud) && likely(!pud_bad(*pud)) && pud_present(*pud)) {
-       pmd = pmd_offset(pud,vaddr);
-         if(pmd && !pmd_none(*pmd) && likely(!pmd_bad(*pmd)) && pmd_present(*pmd)) {                      pte = pte_offset_map(pmd,vaddr);                                   
-         if(pte && !pte_none(*pte) && pte_present(*pte)) { 
+         pud = pud_offset(pgd,vaddr);                                               
+         if (pud && !pud_none(*pud) && likely(!pud_bad(*pud)) && pud_present(*pud)) {
+             pmd = pmd_offset(pud,vaddr);
+             if(pmd && !pmd_none(*pmd) && likely(!pmd_bad(*pmd)) && pmd_present(*pmd)) {             
+                 pte = pte_offset_map(pmd,vaddr);                                   
+                 if(pte && !pte_none(*pte) && pte_present(*pte)) { 
                     // It exists!                                                  
                      return 1;
 -                }                                                                  
 -            }                                                                      
 -        }                                                                          
 +                 }
 +             }
 +         }
      }
-     return 0;                                                                                  }
- */
+     return 0;
+ }
  
  /**
   * @brief Find the mm_struct for a given distributed thread.  
@@@ -3006,11 -3560,21 +3620,23 @@@ void process_mapping_request(struct wor
          .pte_entry = vm_search_page_walk_pte_entry_callback,
          .private = &(resolved)
      };
 +    char *plpath = NULL, *lpath = NULL;
 +    int used_saved_mm = 0, found_vma = 1, found_pte = 1; 
      int i;
      
-     // for perf    
+     // for perf
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+     unsigned long long mapping_response_send_time_start = 0;
+     unsigned long long mapping_response_send_time_end = 0;
+     unsigned long long mapping_request_processing_time_start = native_read_tsc();
+     unsigned long long mapping_request_processing_time_end = 0;
+ #endif
+     
      // Perf start
      int perf = PERF_MEASURE_START(&perf_process_mapping_request);
  
@@@ -3184,44 -3745,56 +3815,56 @@@ changed_can_be_cow
          found_vma = 0;
          found_pte = 0;
          //PSPRINTK("Mapping not found\n");
 -        response.header.type = PCN_KMSG_TYPE_PROC_SRV_MAPPING_RESPONSE;
 -        response.header.prio = PCN_KMSG_PRIO_NORMAL;
 -        response.tgroup_home_cpu = w->tgroup_home_cpu;
 -        response.tgroup_home_id = w->tgroup_home_id;
 -        response.requester_pid = w->requester_pid;
 -        response.address = address;
 -        response.present = 0;
 -        response.vaddr_start = 0;
 -        response.vaddr_size = 0;
 -        response.path[0] = '\0';
 +        response->header.type = PCN_KMSG_TYPE_PROC_SRV_MAPPING_RESPONSE;
 +        response->header.prio = PCN_KMSG_PRIO_NORMAL;
 +        response->tgroup_home_cpu = w->tgroup_home_cpu;
 +        response->tgroup_home_id = w->tgroup_home_id;
 +        response->requester_pid = w->requester_pid;
 +        response->address = address;
 +        response->present = 0;
 +        response->vaddr_start = 0;
 +        response->vaddr_size = 0;
 +        response->path[0] = '\0';
  
          // Handle case where vma was present but no pte.
-         if(vma) {
+         // Optimization, if no pte, and it is specified not to
+         // send the path, we can instead report that the mapping
+         // was not found at all.  This will result in sending a 
+         // nonpresent_mapping_response_t, which is much smaller
+         // than a mapping_response_t.
+         if(vma && w->need_vma) {
              //PSPRINTK("But vma present\n");
              found_vma = 1;
 -            response.present = 1;
 -            response.vaddr_start = vma->vm_start;
 -            response.vaddr_size = vma->vm_end - vma->vm_start;
 -            response.prot = vma->vm_page_prot;
 -            response.vm_flags = vma->vm_flags;
 +            response->present = 1;
 +            response->vaddr_start = vma->vm_start;
 +            response->vaddr_size = vma->vm_end - vma->vm_start;
 +            response->prot = vma->vm_page_prot;
 +            response->vm_flags = vma->vm_flags;
-              if(vma->vm_file == NULL) {
+              if(vma->vm_file == NULL || !w->need_vma) {
 -                 response.path[0] = '\0';
 +               response->path[0] = '\0';
               } else {    
                   plpath = d_path(&vma->vm_file->f_path,lpath,512);
 -                 strcpy(response.path,plpath);
 -                 response.pgoff = vma->vm_pgoff;
 +                 strcpy(response->path,plpath);
 +                 response->pgoff = vma->vm_pgoff;
               }
          }
      }
  
      // Send response
-     if(response.present) {
+     if(response->present) {
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+         mapping_response_send_time_start = native_read_tsc();
+         response->send_time = mapping_response_send_time_start;
+ #endif
          DO_UNTIL_SUCCESS(pcn_kmsg_send_long(w->from_cpu,
 -                            (struct pcn_kmsg_long_message*)(&response),
 +                            (struct pcn_kmsg_long_message*)(response),
                              sizeof(mapping_response_t) - 
                              sizeof(struct pcn_kmsg_hdr) -   //
 -                            sizeof(response.path) +         // Chop off the end of the path
 -                            strlen(response.path) + 1));    // variable to save bandwidth.
 +                            sizeof(response->path) +         // Chop off the end of the path
 +                            strlen(response->path) + 1));    // variable to save bandwidth.
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+         mapping_response_send_time_end = native_read_tsc();
+ #endif
      } else {
          // This is an optimization to get rid of the _long send 
          // which is a time sink.
          nonpresent_response.tgroup_home_id  = w->tgroup_home_id;
          nonpresent_response.requester_pid = w->requester_pid;
          nonpresent_response.address = w->address;
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+         mapping_response_send_time_start = native_read_tsc();
+         nonpresent_response.send_time = mapping_response_send_time_start;
+ #endif
          DO_UNTIL_SUCCESS(pcn_kmsg_send(w->from_cpu,(struct pcn_kmsg_message*)(&nonpresent_response)));
  
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+         mapping_response_send_time_end = native_read_tsc();
+ #endif
      }
      
 +    kfree(lpath);
 +err_response:
 +    kfree(response);
 +err_work:
+     // proc
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+     PS_PROC_DATA_TRACK(PS_PROC_DATA_MAPPING_RESPONSE_SEND_TIME,
+             mapping_response_send_time_end - mapping_response_send_time_start);
+ #endif
      kfree(work);
  
      // Perf stop
@@@ -3341,7 -3950,11 +4024,12 @@@ void process_group_exit_item(struct wor
      group_exit_work_t* w = (group_exit_work_t*) work;
      struct task_struct *task = NULL;
      struct task_struct *g;
 +    unsigned long flags;
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+     unsigned long long end_time;
+     unsigned long long total_time;
+     unsigned long long start_time = native_read_tsc();
+ #endif
  
      //int perf = PERF_MEASURE_START(&perf_process_group_exit_item);
      PSPRINTK("%s: entered\n",__func__);
@@@ -3506,10 -4128,19 +4216,20 @@@ void process_mprotect_item(struct work_
      data_header_t* curr = NULL;
      mm_data_t* mm_data = NULL;
      mm_data_t* to_munmap = NULL;
 -    struct mm_struct *mm_to_munmap = NULL;
 +    struct mm_struct *mm_to_munmap = NULL;
  
      int perf = PERF_MEASURE_START(&perf_process_mprotect_item);
 +    
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+     unsigned long long end_time;
+     unsigned long long total_time;
+     unsigned long long start_time = native_read_tsc();
+ #endif
+    
+     current->enable_distributed_munmap = 0;
+     current->enable_do_mmap_pgoff_hook = 0;
      // Find the task
      read_lock(&tasklist_lock);
      do_each_thread(g,task) {
@@@ -4518,14 -5753,8 +5871,16 @@@ static int handle_clone_request(struct 
      clone_data->thread_ds = request->thread_ds;
      clone_data->thread_fsindex = request->thread_fsindex;
      clone_data->thread_gsindex = request->thread_gsindex;
 +    //TODO this part of the code requires refactoring; it is ugly and could not be worse. Copying each element of one data structure into another without any transformation (acceptable in the heterogeneous case) is a waste of resources.
 +#ifdef FPU_   
 +         clone_data->task_flags = request->task_flags;
 +         clone_data->task_fpu_counter = request->task_fpu_counter;
 +         clone_data->thread_has_fpu = request->thread_has_fpu;
 +         clone_data->fpu_state = request->fpu_state;
 +     //end FPU code
 +#endif
+     clone_data->def_flags = request->def_flags;
+     clone_data->personality = request->personality;
      clone_data->vma_list = NULL;
      clone_data->tgroup_home_cpu = request->tgroup_home_cpu;
      clone_data->tgroup_home_id = request->tgroup_home_id;
      }
  
      spin_unlock_irqrestore(&_data_head_lock,lockflags);
 -#endif
  
 -    perf_dd = native_read_tsc();
 +    add_data_entry(clone_data);
 +
 +    perf_dd = native_read_tsc();
  
      {
+ #ifdef PROCESS_SERVER_USE_KMOD
      struct subprocess_info* sub_info;
      char* argv[] = {clone_data->exe_path,NULL};
      static char *envp[] = { 
@@@ -4753,7 -6165,8 +6314,9 @@@ int process_server_import_address_space
      PSPRINTK("%s: t_home_id{%d}\n",__func__,current->t_home_id);
    
      if(!thread_mm) {
 +        
+        
+ #ifdef PROCESS_SERVER_USE_KMOD
          PS_DOWN_WRITE(&current->mm->mmap_sem);
  
          // Gut existing mappings
          flush_tlb_mm(current->mm);
          flush_cache_mm(current->mm);
          PS_UP_WRITE(&current->mm->mmap_sem);
 - 
 +        
          // import exe_file
          f = filp_open(clone_data->exe_path,O_RDONLY | O_LARGEFILE, 0);
-         if(f) {
+         if(!IS_ERR(f)) {
              get_file(f);
              current->mm->exe_file = f;
              filp_close(f,NULL);
+         } else {
+             printk("%s: Error opening file %s\n",__func__,clone_data->exe_path);
+         }
+        
+ #else
+         struct mm_struct* mm = mm_alloc();
+         if(mm) {
+             init_new_context(current,mm);
+             // import exe_file
+             f = filp_open(clone_data->exe_path,O_RDONLY | O_LARGEFILE , 0);
+             if(!IS_ERR(f)) {
+                 //get_file(f);
+                 //mm->exe_file = f;
+                 set_mm_exe_file(mm,f);
+                 filp_close(f,NULL);
+             } else {
+                 printk("%s: Error opening executable file\n",__func__);
+             }
+             mm->task_size = TASK_SIZE;
+             mm->token_priority = 0;
+             mm->last_interval = 0;
+             arch_pick_mmap_layout(mm);
+             atomic_inc(&mm->mm_users);
+             exec_mmap(mm);
+         }
+ #endif
  
          perf_c = native_read_tsc();    
  
      current->thread.es = clone_data->thread_es;
      current->thread.ds = clone_data->thread_ds;
      current->thread.usersp = clone_data->thread_usersp;
+     current->thread.fsindex = clone_data->thread_fsindex;
+     current->thread.fs = clone_data->thread_fs;
+     current->thread.gs = clone_data->thread_gs;    
+     current->thread.gsindex = clone_data->thread_gsindex;
 +   
 +
 +    //mklinux_akshay
 +    current->origin_pid = clone_data->origin_pid;
 +    sigorsets(&current->blocked,&current->blocked,&clone_data->remote_blocked) ;
 +    sigorsets(&current->real_blocked,&current->real_blocked,&clone_data->remote_real_blocked);
 +    sigorsets(&current->saved_sigmask,&current->saved_sigmask,&clone_data->remote_saved_sigmask);
 +    current->pending = clone_data->remote_pending;
 +    current->sas_ss_sp = clone_data->sas_ss_sp;
 +    current->sas_ss_size = clone_data->sas_ss_size;
 +
 +    printk(KERN_ALERT "origin pid {%d}-{%d} \n",current->origin_pid,clone_data->origin_pid);
 +
 +    int cnt=0;
 +     for(cnt=0;cnt<_NSIG;cnt++)
 +       current->sighand->action[cnt] = clone_data->action[cnt];
  
      // Set output variables.
      *sp = clone_data->thread_usersp;
        load_gs_index(0);
      if (current->thread.gs)
        checking_wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
-                                                    
+ #else
+     {
+     int i, ch;
+     const char* name = NULL;
+     char tcomm[sizeof(current->comm)];
+     flush_thread();
+     set_fs(USER_DS);
+     current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
+     current->sas_ss_sp = current->sas_ss_size = 0;
+     // Copy exe name
+     name = clone_data->exe_path;
+     for(i = 0; (ch = *(name++)) != '\0';) {
+         if(ch == '/')
+             i = 0;
+         else if (i < (sizeof(tcomm) - 1)) 
+             tcomm[i++] = ch;
+     }
+     tcomm[i] = '\0';
+     set_task_comm(current,tcomm);
+     current->self_exec_id++;
+         
+     flush_signal_handlers(current,0);
+     flush_old_files(current->files);
+     }
+     start_remote_thread(regs);
+ #endif
      } // FS/GS update --- end
  
 +#ifdef FPU_   
 +     //FPU migration code --- server
 +          /* PF_USED_MATH is set if the task used the FPU before
 +           * fpu_counter is incremented every time you go in __switch_to while owning the FPU
 +           * has_fpu is true if the task is the owner of the FPU, thus the FPU contains its data
 +          * fpu.preload (see arch/x86/include/asm/i387.h:switch_fpu_prepare()) is a heuristic
 +           */
 +          if (clone_data->task_flags & PF_USED_MATH)
 +              set_used_math();
 +          current->fpu_counter = clone_data->task_fpu_counter;
 +          if (clone_data->thread_has_fpu & HAS_FPU_MASK) {    
 +        if (fpu_alloc(&current->thread.fpu) == -ENOMEM)
 +                  printk(KERN_ALERT "%s: ERROR fpu_alloc returned -ENOMEM, remote fpu not copied.\n", __func__);
 +              else {
 +                  struct fpu temp; temp.state = &clone_data->fpu_state;
 +                  fpu_copy(&current->thread.fpu, &temp);
 +              }
 +         }
 +     printk(KERN_ALERT"%s: task flags %x fpu_counter %x has_fpu %x [%d:%d]\n",
 +         __func__, current->flags, (int)current->fpu_counter,
 +          (int)current->thread.has_fpu, (int)__thread_has_fpu(current), (int)fpu_allocated(&current->thread.fpu));
 +          //FPU migration code --- is the following optional?
 +          if (tsk_used_math(current) && current->fpu_counter >5) //fpu.preload
 +              __math_state_restore(current);
 +#endif    
 +     // Save off clone data, replacing any that may
      // already exist.
+ #ifdef PROCESS_SERVER_USE_KMOD
      if(current->clone_data) {
          unsigned long lockflags;
          spin_lock_irqsave(&_data_head_lock,lockflags);
@@@ -5299,13 -6793,13 +6994,13 @@@ finished_membership_search
          // the list does not include the current processor group descriptor (TODO)
          struct list_head *iter;
          _remote_cpu_info_list_t *objPtr;
-       struct cpumask *pcpum =0;
- extern struct list_head rlist_head;
+         struct cpumask *pcpum =0;
        extern struct list_head rlist_head;
          list_for_each(iter, &rlist_head) {
 -        objPtr = list_entry(iter, _remote_cpu_info_list_t, cpu_list_member);
 -        i = objPtr->_data._processor;
 -        pcpum  = &(objPtr->_data._cpumask);
 -        if ( bitmap_intersects(cpumask_bits(pcpum),  
 +          objPtr = list_entry(iter, _remote_cpu_info_list_t, cpu_list_member);
 +          i = objPtr->_data._processor;
 +          pcpum  = &(objPtr->_data._cpumask);
 +        if ( bitmap_intersects(cpumask_bits(pcpum),  
                                &(current->previous_cpus),
                                (sizeof(unsigned long) *8)) )
  #endif
                if(i == _cpu) continue;
  #else
            // the list does not include the current processor group descriptor (TODO)
-           struct list_head *iter;
-           _remote_cpu_info_list_t *objPtr;
- extern struct list_head rlist_head;
+               struct list_head *iter;
+               _remote_cpu_info_list_t *objPtr;
            extern struct list_head rlist_head;
              list_for_each(iter, &rlist_head) {
 -                objPtr = list_entry(iter, _remote_cpu_info_list_t, cpu_list_member);
 -                i = objPtr->_data._processor;
 +              objPtr = list_entry(iter, _remote_cpu_info_list_t, cpu_list_member);
 +              i = objPtr->_data._processor;
  #endif
                pcn_kmsg_send(i,(struct pcn_kmsg_message*)(&exit_notification));
              }
@@@ -5456,8 -6967,12 +7168,12 @@@ int process_server_do_munmap(struct mm_
       // Nothing to do for a thread group that's not distributed.
      if(!current->tgroup_distributed || !current->enable_distributed_munmap) {
          goto exit;
 -    }
 +    } 
  
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+     do_time_measurement = 1;
+ #endif
      perf = PERF_MEASURE_START(&perf_process_server_do_munmap);
  
      data = kmalloc(sizeof(munmap_request_data_t),GFP_KERNEL);
@@@ -5812,10 -7398,10 +7611,10 @@@ int process_server_pull_remote_mappings
          // Skip the current cpu
          if(i == _cpu) continue;
  #else
 -        // the list does not include the current processor group descriptor (TODO)
 +    // the list does not include the current processor group descriptor (TODO)
      struct list_head *iter;
      _remote_cpu_info_list_t *objPtr;
- extern struct list_head rlist_head;
    extern struct list_head rlist_head;
      list_for_each(iter, &rlist_head) { 
          objPtr = list_entry(iter, _remote_cpu_info_list_t, cpu_list_member);
          i = objPtr->_data._processor;
                if ( data->vm_flags & VM_NORESERVE )
                        printk(KERN_ALERT"MAPPING ANONYMOUS %p %p data: %lx vma: %lx {%lx-%lx} ret%lx\n",
                                __func__, data->mappings[i].vaddr, data->mappings[i].paddr, 
 -                              data->vm_flags, vma?vma->vm_flags:0, vma?vma->vm_start:0, vma?vma->vm_end:0, err);*/
 +                              data->vm_flags, vma?vma->vm_flags:0, vma?vma->vm_start:0, vma?vma->vm_end:0, err);
 +*/
 +                current->enable_distributed_munmap = 1;
 +                current->enable_do_mmap_pgoff_hook = 1;
              } else {
+                 //unsigned char used_existing;
                  PSPRINTK("opening file to map\n");
                  is_anonymous = 0;
  
                  //PS_UP_WRITE(&current->mm->mmap_sem);
                  goto exit_remove_data;
              }
-             
+             PS_DOWN_READ(&current->mm->mmap_sem); 
              vma = find_vma_checked(current->mm, data->address); //data->vaddr_start);
+             PS_UP_READ(&current->mm->mmap_sem);
              if (data->address < vma->vm_start || vma->vm_end <= data->address)
 -                printk(KERN_ALERT"%s: ERROR %lx is not mapped in current vma {%lx-%lx} remote vma {%lx-%lx}\n",
 -                              __func__, data->address, vma->vm_start, vma->vm_end,
 -                              data->vaddr_start, (data->vaddr_start + data->vaddr_size));
 +              printk(KERN_ALERT"%s: ERROR %lx is not mapped in current vma {%lx-%lx} remote vma {%lx-%lx}\n",
 +                      __func__, data->address, vma->vm_start, vma->vm_end,
 +                      data->vaddr_start, (data->vaddr_start + data->vaddr_size));
          } else {
              PSPRINTK("vma is present, using existing\n");
          }
@@@ -6173,12 -7840,8 +8074,13 @@@ static int do_migration_to_new_cpu(stru
      // This will be a placeholder process for the remote
      // process that is subsequently going to be started.
      // Block its execution.
+     __set_task_state(task,TASK_UNINTERRUPTIBLE);
  
 +   // set_task_state(task,TASK_UNINTERRUPTIBLE); //mklinux_akshay modified to interruptible state
 +
 +
 +    int sig;
 +    struct task_struct *t=current;
      // Book keeping for previous cpu bitmask.
      set_bit(smp_processor_id(),&task->previous_cpus);
  
      task->represents_remote = 1;
      task->t_distributed = 1;
  
 +    /*mklinux_akshay*/
 +    if(task->prev_pid==-1)
 +      task->origin_pid=task->pid;
 +    else
 +      task->origin_pid=task->origin_pid;
 +
 +   struct task_struct *par = task->parent;
 +
 +
      // Book keeping for distributed threads.
      task->tgroup_distributed = 1;
+     read_lock(&tasklist_lock);
      do_each_thread(g,tgroup_iterator) {
          if(tgroup_iterator != task) {
              if(tgroup_iterator->tgid == task->tgid) {
      request->normal_prio = task->normal_prio;
      request->rt_priority = task->rt_priority;
      request->sched_class = task->policy;
+     request->personality = task->personality;
+     
 +
 +    /*mklinux_akshay*/
 +    if (task->prev_pid == -1)
 +      request->origin_pid = task->pid;
 +    else
 +      request->origin_pid = task->origin_pid;
 +    request->remote_blocked = task->blocked;
 +    request->remote_real_blocked = task->real_blocked;
 +    request->remote_saved_sigmask = task->saved_sigmask;
 +    request->remote_pending = task->pending;
 +    request->sas_ss_sp = task->sas_ss_sp;
 +    request->sas_ss_size = task->sas_ss_size;
 +    int cnt = 0;
 +    for (cnt = 0; cnt < _NSIG; cnt++)
 +      request->action[cnt] = task->sighand->action[cnt];
 +
      // struct thread_struct -------------------------------------------------------
      // have a look at: copy_thread() arch/x86/kernel/process_64.c 
      // have a look at: struct thread_struct arch/x86/include/asm/processor.h
      {
 -      unsigned long fs, gs;
 +              unsigned long fs, gs;
        unsigned int fsindex, gsindex;
        unsigned int ds, es;
-       
-           if (current != task)
-             PSPRINTK("DAVEK current is different from task!\n");
+     unsigned long _usersp;
+       if (current != task)
+           PSPRINTK("DAVEK current is different from task!\n");
  
      request->thread_sp0 = task->thread.sp0;
      request->thread_sp = task->thread.sp;
@@@ -6435,9 -8042,9 +8342,10 @@@ PSPRINTK(KERN_ERR"%s: task flags %x fpu
   * <MEASURE perf_process_server_do_migration>
   */
  static int do_migration_back_to_previous_cpu(struct task_struct* task, int cpu) {
 -    back_migration_t mig;
 +    back_migration_t *mig =NULL;
      struct pt_regs* regs = task_pt_regs(task);
 +
+     unsigned long _usersp;
      int perf = -1;
  
      perf = PERF_MEASURE_START(&perf_process_server_do_migration);
@@@ -6611,18 -8190,974 +8519,977 @@@ void process_server_do_return_dispositi
  }
  
  /**
-  * @brief Initialize this module
+  *
   */
- static int __init process_server_init(void) {
+ void wait_for_all_lamport_lock_acquisition(lamport_barrier_queue_t* queue,
+                                            lamport_barrier_entry_t* entry) {
+     data_header_t* curr = NULL;
+     lamport_barrier_queue_t* queue_curr = NULL;
+     int done = 0;
+     PSPRINTK("%s: ts{%llx}\n",__func__,entry->timestamp);
+     PSPRINTK("%s: Starting queues-\n",__func__);
+     dump_all_lamport_queues();
+     while(!done) {
+         done = 1;
+         PS_SPIN_LOCK(&_lamport_barrier_queue_lock);
+         // look through every queue for this thread group
+         curr = (data_header_t*)_lamport_barrier_queue_head;
+         while(curr) {
+             queue_curr = (lamport_barrier_queue_t*) curr;
+             if(queue_curr->tgroup_home_cpu == queue->tgroup_home_cpu &&
+                queue_curr->tgroup_home_id  == queue->tgroup_home_id) {
+                 
+                 // if we don't have the lock, spin again.
+                 if(queue_curr->queue) {
+                     if(queue_curr->queue->timestamp != entry->timestamp) {
+                         done = 0;
+                         break;
+                     }
+                 }
  
-     /*
-      * Cache some local information.
-      */
- //#ifndef SUPPORT_FOR_CLUSTERING
-            _cpu= smp_processor_id();
+             }
+             curr = curr->next;
+         }
+         PS_SPIN_UNLOCK(&_lamport_barrier_queue_lock);
+         if(!done)
+             schedule();
+     }
+     PSPRINTK("%s: Ending queues-\n",__func__);
+     dump_all_lamport_queues();
+     PSPRINTK("%s: exiting ts{%llx}\n",__func__,entry->timestamp);
+ }
+ /**
+  * _lamport_barrier_queue_lock must NOT already be held.
+  */
+ void wait_for_lamport_lock_acquisition(lamport_barrier_queue_t* queue,
+                                        lamport_barrier_entry_t* entry) {
+     // Wait until "entry" is at the front of the queue
+     PSPRINTK("%s: ts{%llx}\n",__func__,entry->timestamp);
+     while(1) {
+         PS_SPIN_LOCK(&_lamport_barrier_queue_lock);
+         if(entry == queue->queue) {
+             queue->active_timestamp = entry->timestamp;
+             PS_SPIN_UNLOCK(&_lamport_barrier_queue_lock);
+             goto lock_acquired;
+         }
+         PS_SPIN_UNLOCK(&_lamport_barrier_queue_lock);
+         schedule();
+     } 
+ lock_acquired:
+     if(queue->is_heavy) {
+         wait_for_all_lamport_lock_acquisition(queue,entry);
+     }
+     PSPRINTK("%s: exiting ts{%llx}\n",__func__,entry->timestamp);
+     return;
+ }
+ /**
+  * _lamport_barrier_queue_lock must NOT already be held.
+  */
+ void wait_for_all_lamport_request_responses(lamport_barrier_entry_t* entry) {
+     PSPRINTK("%s: ts{%llx}\n",__func__,entry->timestamp);
+     while(1) {
+         PS_SPIN_LOCK(&_lamport_barrier_queue_lock);
+         if(entry->expected_responses == entry->responses) {
+             PS_SPIN_UNLOCK(&_lamport_barrier_queue_lock);
+             goto responses_acquired;
+         }
+         PS_SPIN_UNLOCK(&_lamport_barrier_queue_lock);
+         schedule();
+     }
+ responses_acquired:
+     PSPRINTK("%s: exiting ts{%llx}\n",__func__,entry->timestamp);
+     return;
+ }
+ /**
+  * 
+  */
+ void add_entry_to_lamport_queue_light(unsigned long address, 
+                                       unsigned long long ts,
+                                       lamport_barrier_entry_t** entry,
+                                       lamport_barrier_queue_t** queue) {
+     lamport_barrier_queue_t* heavy_queue = NULL;
+     PSPRINTK("%s: addr{%lx},ts{%llx}\n",__func__,address,ts);
+     *entry = kmalloc(sizeof(lamport_barrier_entry_t),GFP_ATOMIC);
+     // form record and place in queue
+     (*entry)->timestamp = ts;
+     (*entry)->responses = 0;
+     (*entry)->expected_responses = 0;
+     (*entry)->allow_responses = 0;
+     (*entry)->is_heavy = 0;
+     (*entry)->cpu = _cpu;
+     // find queue if it exists
+     *queue = find_lamport_barrier_queue(current->tgroup_home_cpu,
+                                      current->tgroup_home_id,
+                                      address,
+                                      0);
+     // If no queue exists, create one
+     if(!*queue) {
+         *queue = kmalloc(sizeof(lamport_barrier_queue_t),GFP_ATOMIC);
+         (*queue)->tgroup_home_cpu = current->tgroup_home_cpu;
+         (*queue)->tgroup_home_id  = current->tgroup_home_id;
+         (*queue)->address = address;
+         (*queue)->is_heavy = 0;
+         PSPRINTK("%s: Setting active_timestamp to 0\n",__func__);
+         (*queue)->active_timestamp = 0;
+         (*queue)->queue = NULL;
+         add_data_entry_to(*queue,NULL,&_lamport_barrier_queue_head);
+         // Add all heavy entries to this queue
+         heavy_queue = find_lamport_barrier_queue(current->tgroup_home_cpu,
+                                                  current->tgroup_home_id,
+                                                  0,
+                                                  1);
+         if(heavy_queue) {
+             lamport_barrier_entry_t* curr = heavy_queue->queue;
+             PSPRINTK("%s: found heavy queue\n",__func__);
+             while(curr) {
+                 lamport_barrier_entry_t* e = kmalloc(sizeof(lamport_barrier_entry_t),GFP_ATOMIC);
+                 PSPRINTK("%s: adding entry from heavy queue to queue(addr{%lx}) ts{%llx}\n",
+                         __func__,address,curr->timestamp);
+                 e->timestamp = curr->timestamp;
+                 e->responses = 0;
+                 e->expected_responses = 0;
+                 e->allow_responses = 0;
+                 e->is_heavy = 1;
+                 e->cpu = curr->cpu;
+                 
+                 add_fault_entry_to_queue(e,*queue);
+                 if((*queue)->queue == e) {
+                     PSPRINTK("%s: new entry is at the front of the queue\n",
+                             __func__);
+                     PSPRINTK("%s: setting active timestamp to %llx\n",
+                             __func__,e->timestamp);
+                     (*queue)->active_timestamp = e->timestamp;
+                 }
+                
+                 curr = (lamport_barrier_entry_t*)curr->header.next;
+             }
+         }
+     } 
+     // Add entry to queue
+     add_fault_entry_to_queue(*entry,*queue);
+     dump_lamport_queue(*queue);
+ }
+ static void add_entry_to_lamport_queue_heavy(unsigned long long ts,
+                                       lamport_barrier_entry_t** entry,
+                                       lamport_barrier_queue_t** queue) {
+     data_header_t* curr = NULL;
+     lamport_barrier_queue_t* queue_curr = NULL;
+     PSPRINTK("%s: ts{%llx}\n",__func__,ts);
+     *entry = kmalloc(sizeof(lamport_barrier_entry_t),GFP_ATOMIC);
+     // form record and place in queue
+     (*entry)->timestamp = ts;
+     (*entry)->responses = 0;
+     (*entry)->expected_responses = 0;
+     (*entry)->allow_responses = 0;
+     (*entry)->is_heavy = 1;
+     (*entry)->cpu = _cpu;
+     // find queue if it exists
+     *queue = find_lamport_barrier_queue(current->tgroup_home_cpu,
+                                         current->tgroup_home_id,
+                                         0,
+                                         1);
+     // If no queue exists, create one
+     if(!*queue) {
+         PSPRINTK("%s: adding heavy queue\n",__func__);
+         *queue = kmalloc(sizeof(lamport_barrier_queue_t),GFP_ATOMIC);
+         (*queue)->tgroup_home_cpu = current->tgroup_home_cpu;
+         (*queue)->tgroup_home_id  = current->tgroup_home_id;
+         (*queue)->address = 0;
+         (*queue)->is_heavy = 1;
+         PSPRINTK("%s: Setting active_timestamp to 0\n",__func__);
+         (*queue)->active_timestamp = 0;
+         (*queue)->queue = NULL;
+         add_data_entry_to(*queue,NULL,&_lamport_barrier_queue_head);
+     } 
+     // Add entry to queue
+     add_fault_entry_to_queue(*entry,*queue);
+     // Add entry to all existing non-heavy queues for this thread group
+     curr = (data_header_t*)_lamport_barrier_queue_head; 
+     while(curr) {
+         queue_curr = (lamport_barrier_queue_t*) curr;
+         if(queue_curr->tgroup_home_cpu == current->tgroup_home_cpu &&
+            queue_curr->tgroup_home_id  == current->tgroup_home_id) {
+             if(!queue_curr->is_heavy) {
+                 lamport_barrier_entry_t* e = kmalloc(sizeof(lamport_barrier_entry_t),GFP_ATOMIC);
+                 PSPRINTK("%s: adding entry to non heavy queue addr{%lx}\n",
+                         __func__,queue_curr->address);
+                 e->timestamp = ts;
+                 e->responses = 0;
+                 e->expected_responses = 0;
+                 e->allow_responses = 0;
+                 e->is_heavy = 1;
+                 e->cpu = _cpu;
+                 add_fault_entry_to_queue(e,queue_curr);
+                 if(queue_curr->queue == e) {
+                     PSPRINTK("%s: new entry is at the front of the queue\n",
+                             __func__);
+                     PSPRINTK("%s: setting active timestamp to %llx\n",
+                             __func__,e->timestamp);
+                     queue_curr->active_timestamp = e->timestamp;
+                 }
+                 PSPRINTK("Modified non heavy queue-\n");
+                 dump_lamport_queue(queue_curr);
+             }
+             
+         }
+         curr = curr->next;
+     }
+     PSPRINTK("%s: exiting\n",__func__);
+ }
+ /**
+  * _lamport_barrier_queue_lock must already be held.
+  */
+ static void add_entry_to_lamport_queue(unsigned long address, 
+                                 unsigned long long ts,
+                                 lamport_barrier_entry_t** entry,
+                                 lamport_barrier_queue_t** queue,
+                                 int is_heavy) {
+     if(is_heavy) {
+         add_entry_to_lamport_queue_heavy(ts,entry,queue);
+     } else {
+         add_entry_to_lamport_queue_light(address,ts,entry,queue);
+     }
+ }
+ /**
+  *
+  */
+ static int process_server_acquire_page_lock_range_maybeheavy(unsigned long address,size_t sz, int is_heavy) {
+     lamport_barrier_request_range_t* request = NULL;
+     lamport_barrier_entry_t** entry_list = NULL;
+     lamport_barrier_queue_t** queue_list = NULL;
+     int i,s;
+     unsigned long addr;
+     int index;
+     int page_count = sz / PAGE_SIZE;
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+     unsigned long long end_time = 0;
+     unsigned long long total_time = 0;
+     unsigned long long start_time = native_read_tsc();
+ #endif
+     if(!current->tgroup_distributed) return 0;
+     PSPRINTK("%s: addr{%lx},sz{%zu},is_heavy{%d}\n",__func__,address,sz,is_heavy);
+     BUG_ON(is_heavy && (sz > PAGE_SIZE));
+     entry_list = kmalloc(sizeof(lamport_barrier_entry_t*)*page_count,GFP_KERNEL);
+     queue_list = kmalloc(sizeof(lamport_barrier_queue_t*)*page_count,GFP_KERNEL);
+     request = kmalloc(sizeof(lamport_barrier_request_range_t), GFP_KERNEL);
+   
+     BUG_ON(!request);
+     BUG_ON(!entry_list);
+     BUG_ON(!queue_list);
+     address &= PAGE_MASK;
+     request->header.type = PCN_KMSG_TYPE_PROC_SRV_LAMPORT_BARRIER_REQUEST_RANGE;
+     request->header.prio = PCN_KMSG_PRIO_NORMAL;
+     request->address = address;
+     request->is_heavy = is_heavy;
+     request->sz = sz;
+     request->tgroup_home_cpu = current->tgroup_home_cpu;
+     request->tgroup_home_id =  current->tgroup_home_id;
+     // Grab the fault barrier queue lock
+     PS_SPIN_LOCK(&_lamport_barrier_queue_lock);
+     
+     // create timestamp
+     request->timestamp = get_next_ts_value(); /*native_read_tsc();*/
+     if(!is_heavy) {
+         index = 0;
+         for(addr = address; addr < address + sz; addr += PAGE_SIZE) {
+             add_entry_to_lamport_queue(addr,
+                                        request->timestamp,
+                                        &(entry_list[index]),
+                                        &(queue_list[index]),
+                                        0);
+             index++;
+         }
+     } else {
+         add_entry_to_lamport_queue(0,
+                                    request->timestamp,
+                                    &(entry_list[0]),
+                                    &(queue_list[0]),
+                                    1);
+     }
+     PS_SPIN_UNLOCK(&_lamport_barrier_queue_lock);
+     // Send out request to everybody
+     for(i = 0; i < NR_CPUS; i++) {
+         if(i == _cpu) continue;
+         s = pcn_kmsg_send(i,(struct pcn_kmsg_message*)request);
+         if(!s) {
+             for(index = 0; index < page_count; index++) 
+                 entry_list[index]->expected_responses++;
+         }
+     }
+     mb();
+     kfree(request);
+     for(index = 0; index < page_count; index++)
+         wait_for_all_lamport_request_responses(entry_list[index]);
+     for(index = 0; index < page_count; index++)
+         wait_for_lamport_lock_acquisition(queue_list[index],entry_list[index]);
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+     end_time = native_read_tsc();
+     for(index = 0; index < page_count; index++)
+         entry_list[index]->lock_acquired = end_time;
+     total_time = end_time - start_time;
+     PS_PROC_DATA_TRACK(PS_PROC_DATA_WAITING_FOR_LAMPORT_LOCK,total_time);
+ #endif
+     kfree(entry_list);
+     kfree(queue_list);
+     PSPRINTK("%s: exiting\n",__func__);
+     return 0;
+ }
+ int process_server_acquire_page_lock_range(unsigned long address,size_t sz) {
+     return process_server_acquire_page_lock_range_maybeheavy(address,sz,0);
+ }
+ /**
+  *
+  */
+ int process_server_acquire_page_lock(unsigned long address) {
+     return process_server_acquire_page_lock_range(address,PAGE_SIZE);
+ }
+ /**
+  *
+  */
+ int process_server_acquire_heavy_lock(void) {
+     return process_server_acquire_page_lock_range_maybeheavy(0,PAGE_SIZE,1);
+ }
+ /**
+  *
+  */
+ int process_server_acquire_distributed_mm_lock(void) {
+     return process_server_acquire_page_lock_range(0,PAGE_SIZE);
+ }
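
The heavy and distributed-mm variants above reuse the same range mechanism with address 0. A sketch of how a VMA-modifying path such as mprotect might bracket its work; the caller context is assumed, not taken from this diff:

/* Sketch (assumed caller context): serialize a distributed VMA change
 * against remote page faults with the heavy lock. */
process_server_acquire_heavy_lock();
down_write(&current->mm->mmap_sem);
/* ... modify the VMA locally and propagate the change to remote kernels ... */
up_write(&current->mm->mmap_sem);
process_server_release_heavy_lock();
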
+ /**
+  *
+  */
+ static void release_local_lamport_lock_light(unsigned long address,
+                                       unsigned long long* timestamp) {
+     lamport_barrier_queue_t* queue = NULL;
+     lamport_barrier_entry_t* entry = NULL;
+     *timestamp = 0;
+     // find queue
+     queue = find_lamport_barrier_queue(current->tgroup_home_cpu,
+                                      current->tgroup_home_id,
+                                      address,
+                                      0);
+     //BUG_ON(!queue);
+     if(queue) {
+         BUG_ON(!queue->queue);
+         BUG_ON(queue->queue->cpu != _cpu);
+         
+         entry = queue->queue;
+         
+         //BUG_ON(entry->timestamp != queue->active_timestamp);
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+         entry->lock_released = native_read_tsc();
+         PS_PROC_DATA_TRACK(PS_PROC_DATA_LAMPORT_LOCK_HELD,
+                                 entry->lock_released - entry->lock_acquired);
+ #endif
+         *timestamp = entry->timestamp;
+         PSPRINTK("%s: Setting active_timestamp to 0\n",__func__);
+         queue->active_timestamp = 0;
+         
+         // remove entry from queue
+         remove_data_entry_from((data_header_t*)entry,(data_header_t**)&queue->queue);
+         kfree(entry); // this is OK, because kfree never sleeps
+         // garbage collect the queue if necessary
+         if(!queue->queue) {
+             remove_data_entry_from(queue,&_lamport_barrier_queue_head);
+             kfree(queue);
+         }
+     
+     }
+ }
+ /**
+  *
+  */
+ static void release_local_lamport_lock_heavy(unsigned long long* timestamp) {
+     data_header_t* curr = _lamport_barrier_queue_head;
+     data_header_t* next = NULL;
+     PSPRINTK("%s\n",__func__);
+     while(curr) {
+         lamport_barrier_queue_t* queue = (lamport_barrier_queue_t*)curr;
+         lamport_barrier_entry_t* entry = NULL;
+         next = curr->next;
+         
+         if(queue->tgroup_home_cpu != current->tgroup_home_cpu ||
+            queue->tgroup_home_id  != current->tgroup_home_id) {
+             curr = next;
+             continue;
+         }
+         BUG_ON(!queue->queue);
+         
+         entry = queue->queue;
+         BUG_ON(!entry);
+         BUG_ON(!entry->is_heavy);
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+         entry->lock_released = native_read_tsc();
+         PS_PROC_DATA_TRACK(PS_PROC_DATA_LAMPORT_LOCK_HELD,
+                                 entry->lock_released - entry->lock_acquired);
+ #endif
+         *timestamp = entry->timestamp;
+         PSPRINTK("%s: Setting active timestamp to 0\n",__func__);
+         queue->active_timestamp = 0;
+       
+         PSPRINTK("%s: Removing heavy entry ts{%llx},cpu{%d},heavy{%d},addr{%lx}\n",
+                 __func__,
+                 entry->timestamp,
+                 entry->cpu,
+                 entry->is_heavy,
+                 queue->address);
+         // remove entry from queue
+         remove_data_entry_from((data_header_t*)entry,(data_header_t**)&queue->queue);
+         kfree(entry); // this is OK, because kfree never sleeps
+         // garbage collect the queue if necessary
+         if(!queue->queue) {
+             PSPRINTK("%s: Removing queue is_heavy{%d}\n",__func__,queue->is_heavy);
+             remove_data_entry_from(queue,&_lamport_barrier_queue_head);
+             kfree(queue);
+         }
+         
+         curr = next;
+     }
+ }
+ /**
+  *
+  */
+ static void release_local_lamport_lock(unsigned long address,
+                                 unsigned long long* timestamp,
+                                 int is_heavy) {
+     if(0 != is_heavy) {
+         release_local_lamport_lock_heavy(timestamp);
+     } else {
+         release_local_lamport_lock_light(address,timestamp);
+     }
+ }
+ /**
+  *
+  */
+ void process_server_release_page_lock_range_maybeheavy(unsigned long address,size_t sz, int is_heavy) {
+     lamport_barrier_release_range_t* release = NULL;
+     int i;
+     int index;
+     unsigned long long timestamp = 0;
+     unsigned long long tmp_ts = 0;
+     int page_count = sz / PAGE_SIZE;
+     BUG_ON(is_heavy && (sz > PAGE_SIZE));
+     if(!current->tgroup_distributed) return;
+     PSPRINTK("%s: addr{%lx},sz{%zu},is_heavy{%d}\n",__func__,address,sz,is_heavy);
+     address &= PAGE_MASK;
+     release = kmalloc(sizeof(lamport_barrier_release_range_t),
+                         GFP_KERNEL);
+     PS_SPIN_LOCK(&_lamport_barrier_queue_lock);
+     
+     if(is_heavy) {
+         release_local_lamport_lock(0,&tmp_ts,1);
+         if(!timestamp && tmp_ts) timestamp = tmp_ts;
+     } else {
+         for(index = 0; index < page_count; index++) {
+             release_local_lamport_lock(address + (index*PAGE_SIZE),
+                                        &tmp_ts,
+                                        0);
+             if(!timestamp && tmp_ts) timestamp = tmp_ts;
+         }
+     }
+     PS_SPIN_UNLOCK(&_lamport_barrier_queue_lock);
+     // Send release
+     release->header.type = PCN_KMSG_TYPE_PROC_SRV_LAMPORT_BARRIER_RELEASE_RANGE;
+     release->header.prio = PCN_KMSG_PRIO_NORMAL;
+     release->tgroup_home_cpu = current->tgroup_home_cpu;
+     release->tgroup_home_id  = current->tgroup_home_id;
+     release->is_heavy = is_heavy;
+     release->timestamp = timestamp;
+     release->address = address;
+     release->sz = sz;
+     for(i = 0; i < NR_CPUS; i++) {
+         if(i == _cpu) continue;
+         pcn_kmsg_send(i,(struct pcn_kmsg_message*)release);
+     }
+     kfree(release);
+     PSPRINTK("%s: exiting\n",__func__);
+ }
+ /**
+  *
+  */
+ void process_server_release_page_lock_range(unsigned long address,size_t sz) {
+     process_server_release_page_lock_range_maybeheavy(address,sz,0);
+ }
+ /**
+  *
+  */
+ void process_server_release_page_lock(unsigned long address) {
+     process_server_release_page_lock_range(address,PAGE_SIZE);
+ }
+ /**
+  *
+  */
+ void process_server_release_heavy_lock(void) {
+     process_server_release_page_lock_range_maybeheavy(0,PAGE_SIZE,1);
+ }
+ /**
+  *  
+  */
+ void process_server_release_distributed_mm_lock(void) {
+     process_server_release_page_lock_range(0,PAGE_SIZE);
+ }
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+ static void proc_data_reset(int cpu,int entry) {
+     if(entry >= PS_PROC_DATA_MAX) {
+         printk("Invalid proc_data_reset entry %d\n",entry);
+         return;
+     }
+     _proc_data[cpu][entry].total = 0;
+     _proc_data[cpu][entry].count = 0;
+     _proc_data[cpu][entry].min = 0;
+     _proc_data[cpu][entry].max = 0;
+    
+ }
+ #endif
+ /**
+  *
+  */
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+ static int proc_read(char* buf, char**start, off_t off, int count,
+                         int *eof, void*d) {
+     char* p = buf;
+     int i,j,s;
+     stats_query_t query;
+     stats_query_data_t data;
+     sprintf(buf,"See dmesg\n");
+     query.header.prio = PCN_KMSG_PRIO_NORMAL;
+     query.header.type = PCN_KMSG_TYPE_PROC_SRV_STATS_QUERY;
+     query.pid = current->pid;
+     data.pid = current->pid;
+     data.header.data_type = PROCESS_SERVER_STATS_DATA_TYPE;
+     data.expected_responses = 0;
+     data.responses = 0;
+     add_data_entry(&data);
+     // Update all the data
+ #ifndef SUPPORT_FOR_CLUSTERING
+     for(i = 0; i < NR_CPUS; i++) {
+         if(i == _cpu) continue;
+ #else
+     // the list does not include the current processor group descriptor (TODO)
+     struct list_head *iter;
+     _remote_cpu_info_list_t *objPtr;
+     extern struct list_head rlist_head;
+     list_for_each(iter, &rlist_head) {
+         objPtr = list_entry(iter, _remote_cpu_info_list_t, cpu_list_member);
+         i = objPtr->_data._processor;
+ #endif
+         s = pcn_kmsg_send(i,(struct pcn_kmsg_message*)(&query));
+         if(!s) {
+             data.expected_responses++;
+         }
+     }
+     while(data.expected_responses != data.responses) {
+         schedule();
+     }
+     spin_lock(&_data_head_lock);
+     remove_data_entry(&data);
+     spin_unlock(&_data_head_lock);
+     printk("Process Server Data\n");
+     for(i = 0; i < PS_PROC_DATA_MAX; i++) {
+         printk("%s[Tot,Cnt,Max,Min,Avg]:\n",_proc_data[_cpu][i].name);
+         for(j = 0; j < NR_CPUS; j++) {
+             if(_proc_data[j][i].count) {
+                 unsigned long long avg =
+                     _proc_data[j][i].total / _proc_data[j][i].count;
+                 printk("\tcpu{%d}[%llx,%d,%llx,%llx,%llx]\n",
+                                 j,
+                                 _proc_data[j][i].total,
+                                 _proc_data[j][i].count,
+                                 _proc_data[j][i].max,
+                                 _proc_data[j][i].min,
+                                 avg);
+             }
+         }
+     }
+     return strlen(buf);
+ }           
+ #endif
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+ static void proc_track_data(int entry, unsigned long long time) {
+     if(entry >= PS_PROC_DATA_MAX) {
+         printk("Invalid proc_track_data entry %d\n",entry);
+         return;
+     }
+     _proc_data[_cpu][entry].total += time;
+     _proc_data[_cpu][entry].count++;
+     if(_proc_data[_cpu][entry].min == 0 || time < _proc_data[_cpu][entry].min)
+         _proc_data[_cpu][entry].min = time;
+     if(time > _proc_data[_cpu][entry].max)
+         _proc_data[_cpu][entry].max = time;
+ }
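
proc_track_data() above keeps a running total, count, minimum, and maximum per statistic, treating a minimum of 0 as "unset"; the average is derived at read time in proc_read(). A standalone sketch of the same bookkeeping:

#include <stdio.h>

struct stat_entry {
    unsigned long long total, min, max;
    int count;
};

/* Same policy as proc_track_data(): min == 0 means "not yet set". */
static void track(struct stat_entry *e, unsigned long long time)
{
    e->total += time;
    e->count++;
    if (e->min == 0 || time < e->min)
        e->min = time;
    if (time > e->max)
        e->max = time;
}

int main(void)
{
    struct stat_entry e = { 0 };
    unsigned long long samples[] = { 30, 10, 20 };
    int i;
    for (i = 0; i < 3; i++)
        track(&e, samples[i]);
    printf("tot=%llu cnt=%d min=%llu max=%llu avg=%llu\n",
           e.total, e.count, e.min, e.max, e.total / e.count);
    return 0;
}
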
+ #endif
+ /**
+  * /proc write handler: clears the local statistics and broadcasts a
+  * stats-clear message to every remote kernel.
+  */
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+ static int proc_write(struct file* file,
+                         const char* buffer,
+                         unsigned long count,
+                         void* data) {
+     int i;
+     int j;
+     stats_clear_t msg;
+     msg.header.type = PCN_KMSG_TYPE_PROC_SRV_STATS_CLEAR;
+     msg.header.prio = PCN_KMSG_PRIO_NORMAL;
+     for(j = 0; j < NR_CPUS; j++)
+         for(i = 0; i < PS_PROC_DATA_MAX; i++)
+             proc_data_reset(j,i);
+ #ifndef SUPPORT_FOR_CLUSTERING
+     for(i = 0; i < NR_CPUS; i++) {
+         if(i == _cpu) continue;
+ #else
+     // the list does not include the current processor group descriptor (TODO)
+     struct list_head *iter;
+     _remote_cpu_info_list_t *objPtr;
+     extern struct list_head rlist_head;
+     list_for_each(iter, &rlist_head) {
+         objPtr = list_entry(iter, _remote_cpu_info_list_t, cpu_list_member);
+         i = objPtr->_data._processor;
+ #endif
+         pcn_kmsg_send(i,(struct pcn_kmsg_message*)&msg);
+     }
+     return count;
+ } 
+ #endif
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+ static void proc_data_init(void) {
+     int i;
+     int j;
+     _proc_entry = create_proc_entry("procsrv",0666,NULL);
+     if(!_proc_entry)
+         return;
+     _proc_entry->read_proc = proc_read;
+     _proc_entry->write_proc = proc_write;
+     for(j = 0; j < NR_CPUS; j++)
+         for(i = 0; i < PS_PROC_DATA_MAX; i++)
+             proc_data_reset(j,i);
+     for(j = 0; j < NR_CPUS; j++) {
+         sprintf(_proc_data[j][PS_PROC_DATA_MAPPING_WAIT_TIME].name,
+                 "Mapping wait time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MAPPING_POST_WAIT_TIME_RESUME].name,
+                 "Time after all mapping responses are in and when the fault handler resumes");
+         sprintf(_proc_data[j][PS_PROC_DATA_MAPPING_REQUEST_SEND_TIME].name,
+                 "Mapping request send time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MAPPING_RESPONSE_SEND_TIME].name,
+                 "Mapping response send time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MAPPING_REQUEST_DELIVERY_TIME].name,
+                 "Mapping request delivery time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MAPPING_RESPONSE_DELIVERY_TIME].name,
+                 "Mapping response delivery time");
+         sprintf(_proc_data[j][PS_PROC_DATA_BREAK_COW_TIME].name,
+                 "Break cow time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MAPPING_REQUEST_PROCESSING_TIME].name,
+                 "Mapping request processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_FAULT_PROCESSING_TIME].name,
+                 "Fault processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_ADJUSTED_PERMISSIONS].name,
+                 "Adjusted permissions fault time");
+         sprintf(_proc_data[j][PS_PROC_DATA_NEWVMA_ANONYMOUS_PTE].name,
+                 "Newvma anonymous pte fault time");
+         sprintf(_proc_data[j][PS_PROC_DATA_NEWVMA_ANONYMOUS_NOPTE].name,
+                 "Newvma anonymous nopte fault time");
+         sprintf(_proc_data[j][PS_PROC_DATA_NEWVMA_FILEBACKED_PTE].name,
+                 "Newvma filebacked pte fault time");
+         sprintf(_proc_data[j][PS_PROC_DATA_NEWVMA_FILEBACKED_NOPTE].name,
+                 "Newvma filebacked nopte fault time");
+         sprintf(_proc_data[j][PS_PROC_DATA_OLDVMA_ANONYMOUS_PTE].name,
+                 "Oldvma anonymous pte fault time");
+         sprintf(_proc_data[j][PS_PROC_DATA_OLDVMA_ANONYMOUS_NOPTE].name,
+                 "Oldvma anonymous nopte fault time");
+         sprintf(_proc_data[j][PS_PROC_DATA_OLDVMA_FILEBACKED_PTE].name,
+                 "Oldvma filebacked pte fault time");
+         sprintf(_proc_data[j][PS_PROC_DATA_OLDVMA_FILEBACKED_NOPTE].name,
+                 "Oldvma filebacked nopte fault time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MUNMAP_PROCESSING_TIME].name,
+                 "Munmap processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MUNMAP_REQUEST_PROCESSING_TIME].name,
+                 "Munmap request processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MPROTECT_PROCESSING_TIME].name,
+                 "Mprotect processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MPROTECT_REQUEST_PROCESSING_TIME].name,
+                 "Mprotect request processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_EXIT_PROCESSING_TIME].name,
+                 "Exit processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_EXIT_NOTIFICATION_PROCESSING_TIME].name,
+                 "Exit notification processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_GROUP_EXIT_PROCESSING_TIME].name,
+                 "Group exit processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_GROUP_EXIT_NOTIFICATION_PROCESSING_TIME].name,
+                 "Group exit notification processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_IMPORT_TASK_TIME].name,
+                 "Import migrated task information time");
+         sprintf(_proc_data[j][PS_PROC_DATA_COUNT_REMOTE_THREADS_PROCESSING_TIME].name,
+                 "Count remote threads processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_MK_PAGE_WRITABLE].name,
+                 "Make page writable processing time");
+         sprintf(_proc_data[j][PS_PROC_DATA_WAITING_FOR_LAMPORT_LOCK].name,
+                 "Waiting for Lamport lock on virtual page");
+         sprintf(_proc_data[j][PS_PROC_DATA_LAMPORT_LOCK_HELD].name,
+                 "Lamport lock held");
+     }
+ }
+ #endif
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+ static int handle_stats_clear(struct pcn_kmsg_message* inc_msg) {
+     int i,j;
+     for(j = 0; j < NR_CPUS; j++)
+         for(i = 0; i < PS_PROC_DATA_MAX; i++)
+             proc_data_reset(j,i);
+     pcn_kmsg_free_msg(inc_msg);
+     return 0;
+ }
+ #endif
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+ static void process_stats_query(struct work_struct* w) {
+     stats_response_t* response = kmalloc(sizeof(stats_response_t),GFP_KERNEL);
+     int i;
+     stats_query_work_t* work = (stats_query_work_t*)w;
+     response->header.type = PCN_KMSG_TYPE_PROC_SRV_STATS_RESPONSE;
+     response->header.prio = PCN_KMSG_PRIO_NORMAL;
+     response->pid = work->pid;
+     for(i = 0; i < PS_PROC_DATA_MAX; i++) { 
+         response->data[i].count = _proc_data[_cpu][i].count;
+         response->data[i].total = _proc_data[_cpu][i].total;
+         response->data[i].min   = _proc_data[_cpu][i].min;
+         response->data[i].max   = _proc_data[_cpu][i].max;
+     }
+     pcn_kmsg_send_long(work->from_cpu,
+                         (struct pcn_kmsg_long_message*)response,
+                         sizeof(stats_response_t) - sizeof(response->header));
+     kfree(response);
+     kfree(w);
+ }
+ #endif
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+ static int handle_stats_query(struct pcn_kmsg_message* inc_msg) {
+     stats_query_t* query = (stats_query_t*)inc_msg;
+     stats_query_work_t* work = kmalloc(sizeof(stats_query_work_t),GFP_ATOMIC);
+     if(work) {
+         INIT_WORK( (struct work_struct*)work, process_stats_query);
+         work->pid = query->pid;
+         work->from_cpu = query->header.from_cpu;
+         queue_work(exit_wq, (struct work_struct*)work);
+     }
+     pcn_kmsg_free_msg(inc_msg);
+     return 0;
+ }
+ #endif
+ #ifdef PROCESS_SERVER_HOST_PROC_ENTRY
+ static int handle_stats_response(struct pcn_kmsg_message* inc_msg) {
+     stats_response_t* response = (stats_response_t*)inc_msg;
+     stats_query_data_t* data = find_stats_query_data(response->pid);
+     int from_cpu = response->header.from_cpu;
+     if(data) {
+         int i;
+         for(i = 0; i < PS_PROC_DATA_MAX; i++) {
+             _proc_data[from_cpu][i].count = response->data[i].count;
+             _proc_data[from_cpu][i].total = response->data[i].total;
+             _proc_data[from_cpu][i].min   = response->data[i].min;
+             _proc_data[from_cpu][i].max   = response->data[i].max;
+         }
+         data->responses++;
+     }
+     pcn_kmsg_free_msg(inc_msg);
+     return 0;
+ }
+ #endif
+ /**
+  * Atomically add value to *variable and return the previous value.
+  * From the Wikipedia page "Fetch-and-add", modified to use the 64-bit
+  * XADDQ instruction.
+  */
+ static inline unsigned long fetch_and_add(volatile unsigned long * variable, 
+                       unsigned long value) {
+     asm volatile( 
+              "lock; xaddq %%rax, %2;"
+              :"=a" (value)                   //Output
+              : "a" (value), "m" (*variable)  //Input
+              :"memory" );
+     return value;
+ }
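
A runnable userspace demonstration of the same lock-prefixed XADDQ (x86-64 only; compile with gcc -O2 -pthread), showing that concurrent increments are never lost:

#include <pthread.h>
#include <stdio.h>

static volatile unsigned long counter = 0;

/* Identical asm to the kernel helper above. */
static inline unsigned long fetch_and_add(volatile unsigned long *variable,
                                          unsigned long value)
{
    asm volatile("lock; xaddq %%rax, %2;"
                 : "=a" (value)                  /* old value comes back in rax */
                 : "a" (value), "m" (*variable)
                 : "memory");
    return value;
}

static void *worker(void *arg)
{
    int i;
    for (i = 0; i < 1000000; i++)
        fetch_and_add(&counter, 1);
    return NULL;
}

int main(void)
{
    pthread_t t[4];
    int i;
    for (i = 0; i < 4; i++)
        pthread_create(&t[i], NULL, worker, NULL);
    for (i = 0; i < 4; i++)
        pthread_join(t[i], NULL);
    printf("counter = %lu (expect 4000000)\n", counter);
    return 0;
}
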
+ /**
+  * Draw the next value from the cluster-wide timestamp counter.
+  */
+ static unsigned long get_next_ts_value(void) {
+     return fetch_and_add(ts_counter,1);
+ }
+ /**
+  * Ask the master kernel (cpu 0) for the physical address of its
+  * timestamp counter page.
+  */
+ static unsigned long long* get_master_ts_counter_address(void) {
+     get_counter_phys_request_t request;
+     request.header.type = PCN_KMSG_TYPE_PROC_SRV_GET_COUNTER_PHYS_REQUEST;
+     request.header.prio = PCN_KMSG_PRIO_NORMAL;
+     if(!get_counter_phys_data)
+         get_counter_phys_data = kmalloc(sizeof(get_counter_phys_data_t),GFP_KERNEL);
+     if(!get_counter_phys_data)
+         return NULL;
+     get_counter_phys_data->resp = 0;
+     get_counter_phys_data->response_received = 0;
+     pcn_kmsg_send(0,(struct pcn_kmsg_message*)&request);
+     while(!get_counter_phys_data->response_received)
+         schedule();
+      
+     return (unsigned long long*)get_counter_phys_data->resp;
+ }
+ /**
+  * Set up the cluster-wide timestamp counter: the master allocates a
+  * page and publishes it; every other kernel maps the master's page.
+  */
+ static void init_shared_counter(void) {
+     if(!_cpu) {
+         // Master allocs space, then shares it
+         void* pg = kmalloc(PAGE_SIZE,GFP_KERNEL);
+         ts_counter = pg;
+         *ts_counter = 0;
+         get_next_ts_value();
+         printk("%s: ts_counter{%p},*ts_counter{%lx}\n",__func__,
+                 ts_counter,
+                 get_next_ts_value());
+     } else {
+         // ask for physical address of master's ts_counter
+         ts_counter = ioremap_cache((resource_size_t)get_master_ts_counter_address(), PAGE_SIZE);
+         printk("%s: ts_counter{%p},*ts_counter{%lx}\n",__func__,
+                 ts_counter,
+                 get_next_ts_value());
+     }
+ }
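
A rough userspace analogue of this master-publishes/remotes-map scheme, with a fork()-shared anonymous mapping standing in for the physical-address handoff and ioremap_cache() (purely an illustrative assumption):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int i;
    /* One page visible to both "kernels" (here: parent and child). */
    unsigned long long *ts_counter =
        mmap(NULL, 4096, PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (ts_counter == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    *ts_counter = 0;

    if (fork() == 0) {
        /* "Remote kernel": draws timestamps from the shared page. */
        for (i = 0; i < 1000; i++)
            __sync_fetch_and_add(ts_counter, 1);
        _exit(0);
    }
    /* "Master kernel" does the same concurrently. */
    for (i = 0; i < 1000; i++)
        __sync_fetch_and_add(ts_counter, 1);
    wait(NULL);
    printf("final counter = %llu (expect 2000)\n", *ts_counter);
    return 0;
}
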
+ /**
+  * @brief Initialize this module
+  */
+ static int __init process_server_init(void) {
+     /*
+      * Cache some local information.
+      */
 -    _cpu = smp_processor_id();
 -
++//#ifndef SUPPORT_FOR_CLUSTERING
++    _cpu = smp_processor_id();
 +//#else
 +//       _cpu = cpumask_first(cpu_present_mask);
 +//#endif
      /*
       * Init global semaphores
       */
diff --cc mm/mprotect.c
@@@ -258,6 -260,19 +258,9 @@@ int do_mprotect(struct task_struct* tas
        if (!arch_validate_prot(prot))
                return -EINVAL;
  
 -#ifdef PROCESS_SERVER_ENFORCE_VMA_MOD_ATOMICITY
 -    if(do_remote) {
+         //printk("%s: doing lock\n",__func__);
 -#ifdef PROCESS_SERVER_USE_HEAVY_LOCK
 -        process_server_acquire_heavy_lock();
+ #elif defined(PROCESS_SERVER_USE_DISTRIBUTED_MM_LOCK)
+         process_server_acquire_distributed_mm_lock();
 -#else
 -        process_server_acquire_page_lock_range(start,len);
 -#endif
 -    }
 -#endif
 -
        reqprot = prot;
        /*
         * Does the application expect PROT_READ to imply PROT_EXEC:
@@@ -340,6 -355,18 +343,8 @@@ out
          process_server_do_mprotect(task,start,len,prot);
      }
  
 -#ifdef PROCESS_SERVER_ENFORCE_VMA_MOD_ATOMICITY
 -    if(do_remote) {
 -#ifdef PROCESS_SERVER_USE_HEAVY_LOCK
 -        process_server_release_heavy_lock();
+ #elif defined(PROCESS_SERVER_USE_DISTRIBUTED_MM_LOCK)
+         process_server_release_distributed_mm_lock();
 -#else
 -        process_server_release_page_lock_range(start,len);
 -#endif
 -    }
 -#endif
 -
        return error;
  
  }
diff --cc mm/mremap.c
@@@ -432,6 -432,52 +432,10 @@@ unsigned long do_mremap(unsigned long a
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
 -    int original_enable_distributed_munmap = current->enable_distributed_munmap;
 -    unsigned long a;
 -    current->enable_distributed_munmap = 0;
 -
 -    // This is kind of tricky.  We have to lock the old range
 -    // and the new range.
 -    // Also, recursion is not an issue for mremap, since 
 -    // process_server does not ever attempt to do distributed
 -    // remaps, it is naughty, and just does a distributed
 -    // munmap (except locally).  That should probably change.
 -#ifdef PROCESS_SERVER_ENFORCE_VMA_MOD_ATOMICITY
+     up_write(&mm->mmap_sem);
 -#ifdef PROCESS_SERVER_USE_HEAVY_LOCK
 -    process_server_acquire_heavy_lock();
+ #elif defined(PROCESS_SERVER_USE_DISTRIBUTED_MM_LOCK)
+     process_server_acquire_distributed_mm_lock();
 -#else 
 -    {
 -    unsigned long old_start = addr;
 -    unsigned long old_end   = addr + old_len;
 -    unsigned long new_start = new_addr;
 -    unsigned long new_end   = new_addr + new_len;
 -    if(old_end <= new_start || new_end <= old_start) {
 -        process_server_acquire_page_lock_range(old_start,old_len);
 -        process_server_acquire_page_lock_range(new_start,new_len);
 -    } else {
 -        unsigned long min_start = old_start < new_start? old_start : new_start;
 -        unsigned long max_end   = old_end > new_end? old_end : new_end;
 -        process_server_acquire_page_lock_range(min_start,max_end - min_start);
 -    }
 -    }
 -#endif
+     down_write(&mm->mmap_sem);
 -#endif
 -
 -    // Pull in all remote mappings so nothing is lost later.
 -    for(a = addr & PAGE_MASK; a < addr + old_len; a+= PAGE_SIZE) {
 -        struct vm_area_struct *vma_out = NULL;
 -        process_server_pull_remote_mappings(current->mm,
 -                                            NULL,
 -                                            a,
 -                                            NULL,
 -                                            &vma_out,
 -                                            NULL);
 -
 -    }
  
        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
                goto out;
  out:
        if (ret & ~PAGE_MASK)
                vm_unacct_memory(charged);
 -#ifdef PROCESS_SERVER_ENFORCE_VMA_MOD_ATOMICITY
 -#ifdef PROCESS_SERVER_USE_HEAVY_LOCK
 -    process_server_release_heavy_lock();
+ #elif defined(PROCESS_SERVER_USE_DISTRIBUTED_MM_LOCK)
+     process_server_release_distributed_mm_lock();
 -#else
 -    {
 -    unsigned long old_start = addr;
 -    unsigned long old_end   = addr + old_len;
 -    unsigned long new_start = new_addr;
 -    unsigned long new_end   = new_addr + new_len;
 -    if(old_end <= new_start || new_end <= old_start) {
 -        process_server_release_page_lock_range(old_start,old_len);
 -        process_server_release_page_lock_range(new_start,new_len);
 -    } else {
 -        unsigned long min_start = old_start < new_start? old_start : new_start;
 -        unsigned long max_end   = old_end > new_end? old_end : new_end;
 -        process_server_release_page_lock_range(min_start,max_end - min_start);
 -    }
 -
 -    }
 -#endif
 -#endif
 -
 -    current->enable_distributed_munmap = original_enable_distributed_munmap;
 -
        return ret;
  }