author    | Christian Heim <phreak@gentoo.org> | 2006-01-31 13:58:45 +0000
committer | Christian Heim <phreak@gentoo.org> | 2006-01-31 13:58:45 +0000
commit    | 1802889f8e8799c21814c1f67a56e30606bf4e12 (patch)
tree      | f82300abe2caabb3787edd12905ded731375ac25 /openvz-sources
parent    | Adding latest stable patchset release for openvz (diff)
download  | misc-1802889f8e8799c21814c1f67a56e30606bf4e12.tar.gz
          | misc-1802889f8e8799c21814c1f67a56e30606bf4e12.tar.bz2
          | misc-1802889f8e8799c21814c1f67a56e30606bf4e12.zip
Adding latest development patchset release for openvz
svn path=/; revision=222
Diffstat (limited to 'openvz-sources')
-rw-r--r-- | openvz-sources/025.014/0001_linux-2.6.0-nonintconfig.patch |    99
-rw-r--r-- | openvz-sources/025.014/0100_patch-025stab014-core.patch    | 48347
2 files changed, 48446 insertions, 0 deletions
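For context, the two patches added here are meant to be applied on top of a vanilla linux-2.6.15 tree (the core patch sets EXTRAVERSION to -025stab014). A minimal usage sketch follows, assuming the patches are applied from this repository checkout with `patch -p1`; the paths, working directory, and starting .config are assumptions, while the `nonint_oldconfig` target and its behavior (unset CONFIG_* symbols reported on stderr, counted in the exit code) are taken from the Makefile and conf.c hunks in the first patch below.

```sh
# Sketch: applying the 025.014 patchset to a vanilla kernel tree.
# The checkout location and starting config are assumptions; the target name
# nonint_oldconfig comes from the scripts/kconfig/Makefile hunk in 0001_*.patch.
cd linux-2.6.15
patch -p1 < ../misc/openvz-sources/025.014/0001_linux-2.6.0-nonintconfig.patch
patch -p1 < ../misc/openvz-sources/025.014/0100_patch-025stab014-core.patch

cp /boot/config-$(uname -r) .config   # seed from an existing config (assumption)
make nonint_oldconfig                 # non-interactive oldconfig: any symbol that
                                      # would have prompted is printed to stderr as
                                      # CONFIG_<name>, and the exit code counts them
```

The non-interactive mode is presumably there so automated builds (such as ebuild-driven kernel builds) can detect options newly introduced by a patchset without hanging at an interactive prompt.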
diff --git a/openvz-sources/025.014/0001_linux-2.6.0-nonintconfig.patch b/openvz-sources/025.014/0001_linux-2.6.0-nonintconfig.patch new file mode 100644 index 0000000..a7fe97d --- /dev/null +++ b/openvz-sources/025.014/0001_linux-2.6.0-nonintconfig.patch @@ -0,0 +1,99 @@ +--- ./scripts/kconfig/Makefile.nonint 2006-01-03 06:21:10.000000000 +0300 ++++ ./scripts/kconfig/Makefile 2006-01-16 16:59:19.000000000 +0300 +@@ -42,6 +42,10 @@ update-po-config: $(obj)/kxgettext + $(Q)rm -f arch/um/Kconfig_arch + $(Q)rm -f scripts/kconfig/linux_*.pot scripts/kconfig/config.pot + ++nonint_oldconfig: scripts/kconfig/conf ++ ./scripts/kconfig/conf -b arch/$(ARCH)/Kconfig ++ ++ + .PHONY: randconfig allyesconfig allnoconfig allmodconfig defconfig + + randconfig: $(obj)/conf +--- ./scripts/kconfig/conf.c.nonint 2006-01-03 06:21:10.000000000 +0300 ++++ ./scripts/kconfig/conf.c 2006-01-16 16:10:30.000000000 +0300 +@@ -20,6 +20,7 @@ enum { + ask_all, + ask_new, + ask_silent, ++ dont_ask, + set_default, + set_yes, + set_mod, +@@ -36,6 +37,8 @@ static struct menu *rootEntry; + + static char nohelp_text[] = N_("Sorry, no help available for this option yet.\n"); + ++static int return_value = 0; ++ + static void strip(char *str) + { + char *p = str; +@@ -102,6 +105,12 @@ static void conf_askvalue(struct symbol + fflush(stdout); + fgets(line, 128, stdin); + return; ++ case dont_ask: ++ if (!sym_has_value(sym)) { ++ fprintf(stderr,"CONFIG_%s\n",sym->name); ++ return_value++; ++ } ++ return; + case set_default: + printf("%s\n", def); + return; +@@ -346,6 +355,10 @@ static int conf_choice(struct menu *menu + printf("?"); + printf("]: "); + switch (input_mode) { ++ case dont_ask: ++ cnt = def; ++ printf("%d\n", cnt); ++ break; + case ask_new: + case ask_silent: + if (!is_new) { +@@ -482,7 +495,10 @@ static void check_conf(struct menu *menu + if (!conf_cnt++) + printf(_("*\n* Restart config...\n*\n")); + rootEntry = menu_get_parent_menu(menu); +- conf(rootEntry); ++ if (input_mode == dont_ask) ++ fprintf(stderr,"CONFIG_%s\n",sym->name); ++ else ++ conf(rootEntry); + } + } + +@@ -501,6 +517,9 @@ int main(int ac, char **av) + case 'o': + input_mode = ask_new; + break; ++ case 'b': ++ input_mode = dont_ask; ++ break; + case 's': + input_mode = ask_silent; + valid_stdin = isatty(0) && isatty(1) && isatty(2); +@@ -565,6 +584,7 @@ int main(int ac, char **av) + } + case ask_all: + case ask_new: ++ case dont_ask: + conf_read(NULL); + break; + case set_no: +@@ -603,10 +623,10 @@ int main(int ac, char **av) + do { + conf_cnt = 0; + check_conf(&rootmenu); +- } while (conf_cnt); ++ } while ((conf_cnt) && (input_mode != dont_ask)); + if (conf_write(NULL)) { + fprintf(stderr, _("\n*** Error during writing of the kernel configuration.\n\n")); + return 1; + } +- return 0; ++ return return_value; + } diff --git a/openvz-sources/025.014/0100_patch-025stab014-core.patch b/openvz-sources/025.014/0100_patch-025stab014-core.patch new file mode 100644 index 0000000..78bf588 --- /dev/null +++ b/openvz-sources/025.014/0100_patch-025stab014-core.patch @@ -0,0 +1,48347 @@ +diff -uprN linux-2.6.15.orig/Makefile linux-2.6.15-ve025stab014/Makefile +--- linux-2.6.15.orig/Makefile 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/Makefile 2006-01-27 14:48:09.000000000 +0300 +@@ -1,7 +1,7 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 15 +-EXTRAVERSION = ++EXTRAVERSION = -025stab014 + NAME=Sliding Snow Leopard + + # *DOCUMENTATION* +diff -uprN linux-2.6.15.orig/arch/i386/Kconfig linux-2.6.15-ve025stab014/arch/i386/Kconfig +--- 
linux-2.6.15.orig/arch/i386/Kconfig 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/Kconfig 2006-01-27 14:48:07.000000000 +0300 +@@ -1014,12 +1014,18 @@ endmenu + + source "arch/i386/Kconfig.debug" + ++menu "OpenVZ" ++source "kernel/Kconfig.openvz" ++endmenu ++ + source "security/Kconfig" + + source "crypto/Kconfig" + + source "lib/Kconfig" + ++source "kernel/ub/Kconfig" ++ + # + # Use the generic interrupt handling code in kernel/irq/: + # +diff -uprN linux-2.6.15.orig/arch/i386/kernel/apic.c linux-2.6.15-ve025stab014/arch/i386/kernel/apic.c +--- linux-2.6.15.orig/arch/i386/kernel/apic.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/apic.c 2006-01-27 14:48:07.000000000 +0300 +@@ -1185,6 +1185,7 @@ inline void smp_local_timer_interrupt(st + fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) + { + int cpu = smp_processor_id(); ++ struct ve_struct *ve; + + /* + * the NMI deadlock-detector uses this. +@@ -1201,9 +1202,11 @@ fastcall void smp_apic_timer_interrupt(s + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ ++ ve = set_exec_env(get_ve0()); + irq_enter(); + smp_local_timer_interrupt(regs); + irq_exit(); ++ (void)set_exec_env(ve); + } + + /* +diff -uprN linux-2.6.15.orig/arch/i386/kernel/cpu/mtrr/if.c linux-2.6.15-ve025stab014/arch/i386/kernel/cpu/mtrr/if.c +--- linux-2.6.15.orig/arch/i386/kernel/cpu/mtrr/if.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/cpu/mtrr/if.c 2006-01-27 14:48:07.000000000 +0300 +@@ -391,7 +391,7 @@ static int __init mtrr_if_init(void) + return -ENODEV; + + proc_root_mtrr = +- create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root); ++ create_proc_entry("mtrr", S_IWUSR | S_IRUGO, NULL); + if (proc_root_mtrr) { + proc_root_mtrr->owner = THIS_MODULE; + proc_root_mtrr->proc_fops = &mtrr_fops; +diff -uprN linux-2.6.15.orig/arch/i386/kernel/irq.c linux-2.6.15-ve025stab014/arch/i386/kernel/irq.c +--- linux-2.6.15.orig/arch/i386/kernel/irq.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/irq.c 2006-01-27 14:48:07.000000000 +0300 +@@ -59,7 +59,9 @@ fastcall unsigned int do_IRQ(struct pt_r + union irq_ctx *curctx, *irqctx; + u32 *isp; + #endif ++ struct ve_struct *ve; + ++ ve = set_exec_env(get_ve0()); + irq_enter(); + #ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 1KB free? 
*/ +@@ -108,6 +110,7 @@ fastcall unsigned int do_IRQ(struct pt_r + __do_IRQ(irq, regs); + + irq_exit(); ++ (void)set_exec_env(ve); + + return 1; + } +diff -uprN linux-2.6.15.orig/arch/i386/kernel/ldt.c linux-2.6.15-ve025stab014/arch/i386/kernel/ldt.c +--- linux-2.6.15.orig/arch/i386/kernel/ldt.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/ldt.c 2006-01-27 14:48:05.000000000 +0300 +@@ -20,6 +20,8 @@ + #include <asm/desc.h> + #include <asm/mmu_context.h> + ++#include <ub/ub_mem.h> ++ + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ + static void flush_ldt(void *null) + { +@@ -39,9 +41,9 @@ static int alloc_ldt(mm_context_t *pc, i + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) +- newldt = vmalloc(mincount*LDT_ENTRY_SIZE); ++ newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); + else +- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); ++ newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; +diff -uprN linux-2.6.15.orig/arch/i386/kernel/process.c linux-2.6.15-ve025stab014/arch/i386/kernel/process.c +--- linux-2.6.15.orig/arch/i386/kernel/process.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/process.c 2006-01-27 14:48:07.000000000 +0300 +@@ -338,6 +338,13 @@ int kernel_thread(int (*fn)(void *), voi + { + struct pt_regs regs; + ++ /* Don't allow kernel_thread() inside VE */ ++ if (!ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ + memset(®s, 0, sizeof(regs)); + + regs.ebx = (unsigned long) fn; +diff -uprN linux-2.6.15.orig/arch/i386/kernel/signal.c linux-2.6.15-ve025stab014/arch/i386/kernel/signal.c +--- linux-2.6.15.orig/arch/i386/kernel/signal.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/signal.c 2006-01-27 14:48:05.000000000 +0300 +@@ -615,7 +615,7 @@ int fastcall do_signal(struct pt_regs *r + if (!user_mode(regs)) + return 1; + +- if (try_to_freeze()) ++ if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + + if (!oldset) +diff -uprN linux-2.6.15.orig/arch/i386/kernel/smpboot.c linux-2.6.15-ve025stab014/arch/i386/kernel/smpboot.c +--- linux-2.6.15.orig/arch/i386/kernel/smpboot.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/smpboot.c 2006-01-27 14:48:07.000000000 +0300 +@@ -321,6 +321,10 @@ static void __init synchronize_tsc_bp (v + } + if (!buggy) + printk("passed.\n"); ++#ifdef CONFIG_VE ++ /* TSC reset. kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++#endif + } + + static void __init synchronize_tsc_ap (void) +@@ -346,6 +350,10 @@ static void __init synchronize_tsc_ap (v + atomic_inc(&tsc_count_stop); + while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + } ++#ifdef CONFIG_VE ++ /* TSC reset. kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++#endif + } + #undef NR_LOOPS + +@@ -913,6 +921,13 @@ static int __devinit do_boot_cpu(int api + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + idle->thread.eip = (unsigned long) start_secondary; ++ ++#ifdef CONFIG_VE ++ /* Cosmetic: sleep_time won't be changed afterwards for the idle ++ * thread; keep it 0 rather than -cycles. */ ++ VE_TASK_INFO(idle)->sleep_time = 0; ++#endif ++ + /* start_eip had better be page-aligned! 
*/ + start_eip = setup_trampoline(); + +diff -uprN linux-2.6.15.orig/arch/i386/kernel/sys_i386.c linux-2.6.15-ve025stab014/arch/i386/kernel/sys_i386.c +--- linux-2.6.15.orig/arch/i386/kernel/sys_i386.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/sys_i386.c 2006-01-27 14:48:07.000000000 +0300 +@@ -217,7 +217,7 @@ asmlinkage int sys_uname(struct old_utsn + if (!name) + return -EFAULT; + down_read(&uts_sem); +- err=copy_to_user(name, &system_utsname, sizeof (*name)); ++ err=copy_to_user(name, &ve_utsname, sizeof (*name)); + up_read(&uts_sem); + return err?-EFAULT:0; + } +@@ -233,15 +233,15 @@ asmlinkage int sys_olduname(struct oldol + + down_read(&uts_sem); + +- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); ++ error = __copy_to_user(name->sysname,ve_utsname.sysname,__OLD_UTS_LEN); + error |= __put_user(0,name->sysname+__OLD_UTS_LEN); +- error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); ++ error |= __copy_to_user(name->nodename,ve_utsname.nodename,__OLD_UTS_LEN); + error |= __put_user(0,name->nodename+__OLD_UTS_LEN); +- error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); ++ error |= __copy_to_user(name->release,ve_utsname.release,__OLD_UTS_LEN); + error |= __put_user(0,name->release+__OLD_UTS_LEN); +- error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); ++ error |= __copy_to_user(name->version,ve_utsname.version,__OLD_UTS_LEN); + error |= __put_user(0,name->version+__OLD_UTS_LEN); +- error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); ++ error |= __copy_to_user(name->machine,ve_utsname.machine,__OLD_UTS_LEN); + error |= __put_user(0,name->machine+__OLD_UTS_LEN); + + up_read(&uts_sem); +diff -uprN linux-2.6.15.orig/arch/i386/kernel/syscall_table.S linux-2.6.15-ve025stab014/arch/i386/kernel/syscall_table.S +--- linux-2.6.15.orig/arch/i386/kernel/syscall_table.S 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/syscall_table.S 2006-01-27 14:48:05.000000000 +0300 +@@ -294,3 +294,11 @@ ENTRY(sys_call_table) + .long sys_inotify_init + .long sys_inotify_add_watch + .long sys_inotify_rm_watch ++ .rept 510-(.-sys_call_table)/4 ++ .long sys_ni_syscall ++ .endr ++ ++ .long sys_getluid /* 510 */ ++ .long sys_setluid ++ .long sys_setublimit ++ .long sys_ubstat +diff -uprN linux-2.6.15.orig/arch/i386/kernel/timers/timer_tsc.c linux-2.6.15-ve025stab014/arch/i386/kernel/timers/timer_tsc.c +--- linux-2.6.15.orig/arch/i386/kernel/timers/timer_tsc.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/timers/timer_tsc.c 2006-01-27 14:48:07.000000000 +0300 +@@ -85,7 +85,7 @@ static int count2; /* counter for mark_o + * Equal to 2^32 * (1 / (clocks per usec) ). + * Initialized in time_init. 
+ */ +-static unsigned long fast_gettimeoffset_quotient; ++unsigned long fast_gettimeoffset_quotient; + + static unsigned long get_offset_tsc(void) + { +diff -uprN linux-2.6.15.orig/arch/i386/kernel/traps.c linux-2.6.15-ve025stab014/arch/i386/kernel/traps.c +--- linux-2.6.15.orig/arch/i386/kernel/traps.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/kernel/traps.c 2006-01-27 14:48:07.000000000 +0300 +@@ -227,8 +227,11 @@ void show_registers(struct pt_regs *regs + regs->esi, regs->edi, regs->ebp, esp); + printk("ds: %04x es: %04x ss: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss); +- printk("Process %s (pid: %d, threadinfo=%p task=%p)", +- current->comm, current->pid, current_thread_info(), current); ++ printk("Process %s (pid: %d, veid=%d, threadinfo=%p task=%p)", ++ current->comm, current->pid, ++ VEID(VE_TASK_INFO(current)->owner_env), ++ current_thread_info(), current); ++ + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. +@@ -291,6 +294,13 @@ bug: + printk("Kernel BUG\n"); + } + ++static void inline check_kernel_csum_bug(void) ++{ ++ if (kernel_text_csum_broken) ++ printk("Kernel code checksum mismatch detected %d times\n", ++ kernel_text_csum_broken); ++} ++ + /* This is gone through when something in the kernel + * has done something bad and is about to be terminated. + */ +@@ -337,6 +347,7 @@ void die(const char * str, struct pt_reg + show_registers(regs); + } else + printk(KERN_ERR "Recursive die() failure, output suppressed\n"); ++ check_kernel_csum_bug(); + + bust_spinlocks(0); + die.lock_owner = -1; +diff -uprN linux-2.6.15.orig/arch/i386/mm/fault.c linux-2.6.15-ve025stab014/arch/i386/mm/fault.c +--- linux-2.6.15.orig/arch/i386/mm/fault.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/mm/fault.c 2006-01-27 14:48:06.000000000 +0300 +@@ -347,7 +347,6 @@ good_area: + goto bad_area; + } + +- survive: + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo +@@ -485,14 +484,14 @@ no_context: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (tsk->pid == 1) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } +- printk("VM: killing process %s\n", tsk->comm); +- if (error_code & 4) +- do_exit(SIGKILL); ++ if (error_code & 4) { ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. ++ */ ++ force_sig(SIGKILL, tsk); ++ return; ++ } + goto no_context; + + do_sigbus: +diff -uprN linux-2.6.15.orig/arch/i386/mm/init.c linux-2.6.15-ve025stab014/arch/i386/mm/init.c +--- linux-2.6.15.orig/arch/i386/mm/init.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/mm/init.c 2006-01-27 14:48:05.000000000 +0300 +@@ -677,7 +677,7 @@ void __init pgtable_cache_init(void) + pmd_cache = kmem_cache_create("pmd", + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), +- 0, ++ SLAB_UBC, + pmd_ctor, + NULL); + if (!pmd_cache) +@@ -686,7 +686,7 @@ void __init pgtable_cache_init(void) + pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), + PTRS_PER_PGD*sizeof(pgd_t), +- 0, ++ SLAB_UBC, + pgd_ctor, + PTRS_PER_PMD == 1 ? 
pgd_dtor : NULL); + if (!pgd_cache) +diff -uprN linux-2.6.15.orig/arch/i386/mm/pgtable.c linux-2.6.15-ve025stab014/arch/i386/mm/pgtable.c +--- linux-2.6.15.orig/arch/i386/mm/pgtable.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/i386/mm/pgtable.c 2006-01-27 14:48:07.000000000 +0300 +@@ -5,8 +5,10 @@ + #include <linux/config.h> + #include <linux/sched.h> + #include <linux/kernel.h> ++#include <linux/module.h> + #include <linux/errno.h> + #include <linux/mm.h> ++#include <linux/vmalloc.h> + #include <linux/swap.h> + #include <linux/smp.h> + #include <linux/highmem.h> +@@ -64,7 +66,9 @@ void show_mem(void) + printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped); + printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); + printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); ++ vprintstat(); + } ++EXPORT_SYMBOL(show_mem); + + /* + * Associate a virtual page frame with a given physical page frame +@@ -159,9 +163,11 @@ struct page *pte_alloc_one(struct mm_str + struct page *pte; + + #ifdef CONFIG_HIGHPTE +- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); ++ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_HIGHMEM| ++ __GFP_REPEAT|__GFP_ZERO, 0); + #else +- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); ++ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC| ++ __GFP_REPEAT|__GFP_ZERO, 0); + #endif + return pte; + } +diff -uprN linux-2.6.15.orig/arch/ia64/Kconfig linux-2.6.15-ve025stab014/arch/ia64/Kconfig +--- linux-2.6.15.orig/arch/ia64/Kconfig 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/Kconfig 2006-01-27 14:48:07.000000000 +0300 +@@ -461,6 +461,10 @@ endmenu + + source "arch/ia64/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + source "crypto/Kconfig" ++ ++source "kernel/ub/Kconfig" +diff -uprN linux-2.6.15.orig/arch/ia64/ia32/binfmt_elf32.c linux-2.6.15-ve025stab014/arch/ia64/ia32/binfmt_elf32.c +--- linux-2.6.15.orig/arch/ia64/ia32/binfmt_elf32.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/ia32/binfmt_elf32.c 2006-01-27 14:48:05.000000000 +0300 +@@ -136,6 +136,12 @@ ia64_elf32_init (struct pt_regs *regs) + up_write(¤t->mm->mmap_sem); + } + ++ if (ub_memory_charge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * ++ IA32_LDT_ENTRY_SIZE), ++ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, ++ NULL, UB_SOFT)) ++ goto skip; ++ + /* + * Install LDT as anonymous memory. This gives us all-zero segment descriptors + * until a task modifies them via modify_ldt(). +@@ -157,7 +163,12 @@ ia64_elf32_init (struct pt_regs *regs) + } + } + up_write(¤t->mm->mmap_sem); +- } ++ } else ++ ub_memory_uncharge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * ++ IA32_LDT_ENTRY_SIZE), ++ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, NULL); ++ ++skip: + + ia64_psr(regs)->ac = 0; /* turn off alignment checking */ + regs->loadrs = 0; +@@ -212,9 +223,15 @@ ia32_setup_arg_pages (struct linux_binpr + bprm->loader += stack_base; + bprm->exec += stack_base; + ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, IA32_STACK_TOP - ++ (PAGE_MASK & (unsigned long)bprm->p), ++ VM_STACK_FLAGS, NULL, UB_SOFT)) ++ goto err_charge; ++ + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!mpnt) +- return -ENOMEM; ++ goto err_alloc; + + memset(mpnt, 0, sizeof(*mpnt)); + +@@ -231,11 +248,8 @@ ia32_setup_arg_pages (struct linux_binpr + mpnt->vm_flags = VM_STACK_FLAGS; + mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)? 
+ PAGE_COPY_EXEC: PAGE_COPY; +- if ((ret = insert_vm_struct(current->mm, mpnt))) { +- up_write(¤t->mm->mmap_sem); +- kmem_cache_free(vm_area_cachep, mpnt); +- return ret; +- } ++ if ((ret = insert_vm_struct(current->mm, mpnt))) ++ goto err_insert; + current->mm->stack_vm = current->mm->total_vm = vma_pages(mpnt); + } + +@@ -254,6 +268,16 @@ ia32_setup_arg_pages (struct linux_binpr + current->thread.ppl = ia32_init_pp_list(); + + return 0; ++ ++err_insert: ++ up_write(¤t->mm->mmap_sem); ++ kmem_cache_free(vm_area_cachep, mpnt); ++err_alloc: ++ ub_memory_uncharge(mm, IA32_STACK_TOP - ++ (PAGE_MASK & (unsigned long)bprm->p), ++ VM_STACK_FLAGS, NULL); ++err_charge: ++ return ret; + } + + static void +diff -uprN linux-2.6.15.orig/arch/ia64/ia32/sys_ia32.c linux-2.6.15-ve025stab014/arch/ia64/ia32/sys_ia32.c +--- linux-2.6.15.orig/arch/ia64/ia32/sys_ia32.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/ia32/sys_ia32.c 2006-01-27 14:48:07.000000000 +0300 +@@ -1767,7 +1767,7 @@ sys32_ptrace (int request, pid_t pid, un + + ret = -ESRCH; + read_lock(&tasklist_lock); +- child = find_task_by_pid(pid); ++ child = find_task_by_pid_ve(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/asm-offsets.c linux-2.6.15-ve025stab014/arch/ia64/kernel/asm-offsets.c +--- linux-2.6.15.orig/arch/ia64/kernel/asm-offsets.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/kernel/asm-offsets.c 2006-01-27 14:48:07.000000000 +0300 +@@ -44,11 +44,21 @@ void foo(void) + DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); + DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); + DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); ++#ifdef CONFIG_VE ++ DEFINE(IA64_TASK_PID_OFFSET, offsetof ++ (struct task_struct, pids[PIDTYPE_PID].vnr)); ++#else + DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); ++#endif + DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); + DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand)); + DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal)); ++#ifdef CONFIG_VE ++ DEFINE(IA64_TASK_TGID_OFFSET, offsetof ++ (struct task_struct, pids[PIDTYPE_TGID].vnr)); ++#else + DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid)); ++#endif + DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp)); + DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack)); + +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/entry.S linux-2.6.15-ve025stab014/arch/ia64/kernel/entry.S +--- linux-2.6.15.orig/arch/ia64/kernel/entry.S 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/kernel/entry.S 2006-01-27 14:48:05.000000000 +0300 +@@ -1600,5 +1600,12 @@ sys_call_table: + data8 sys_inotify_init + data8 sys_inotify_add_watch + data8 sys_inotify_rm_watch ++.rept 1505-1280 ++ data8 sys_ni_syscall // 1280 - 1499 ++.endr ++ data8 sys_getluid // 1505 ++ data8 sys_setluid ++ data8 sys_setublimit ++ data8 sys_ubstat + + .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/fsys.S linux-2.6.15-ve025stab014/arch/ia64/kernel/fsys.S +--- linux-2.6.15.orig/arch/ia64/kernel/fsys.S 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/kernel/fsys.S 2006-01-27 
14:48:07.000000000 +0300 +@@ -72,6 +72,7 @@ ENTRY(fsys_getpid) + FSYS_RETURN + END(fsys_getpid) + ++#ifndef CONFIG_VE + ENTRY(fsys_getppid) + .prologue + .altrp b6 +@@ -118,6 +119,7 @@ ENTRY(fsys_getppid) + #endif + FSYS_RETURN + END(fsys_getppid) ++#endif + + ENTRY(fsys_set_tid_address) + .prologue +@@ -665,7 +667,11 @@ fsyscall_table: + data8 0 // chown + data8 0 // lseek // 1040 + data8 fsys_getpid // getpid ++#ifdef CONFIG_VE ++ data8 0 ++#else + data8 fsys_getppid // getppid ++#endif + data8 0 // mount + data8 0 // umount + data8 0 // setuid // 1045 +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/irq.c linux-2.6.15-ve025stab014/arch/ia64/kernel/irq.c +--- linux-2.6.15.orig/arch/ia64/kernel/irq.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/kernel/irq.c 2006-01-27 14:48:07.000000000 +0300 +@@ -163,7 +163,9 @@ void fixup_irqs(void) + { + unsigned int irq; + extern void ia64_process_pending_intr(void); ++ struct ve_struct *ve; + ++ ve = set_exec_env(get_ve0()); + ia64_set_itv(1<<16); + /* + * Phase 1: Locate irq's bound to this cpu and +@@ -197,5 +199,6 @@ void fixup_irqs(void) + */ + max_xtp(); + local_irq_disable(); ++ (void)set_exec_env(ve); + } + #endif +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/irq_ia64.c linux-2.6.15-ve025stab014/arch/ia64/kernel/irq_ia64.c +--- linux-2.6.15.orig/arch/ia64/kernel/irq_ia64.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/kernel/irq_ia64.c 2006-01-27 14:48:07.000000000 +0300 +@@ -103,6 +103,7 @@ void + ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) + { + unsigned long saved_tpr; ++ struct ve_struct *ve; + + #if IRQ_DEBUG + { +@@ -139,6 +140,7 @@ ia64_handle_irq (ia64_vector vector, str + * 16 (without this, it would be ~240, which could easily lead + * to kernel stack overflows). + */ ++ ve = set_exec_env(get_ve0()); + irq_enter(); + saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); + ia64_srlz_d(); +@@ -164,6 +166,7 @@ ia64_handle_irq (ia64_vector vector, str + * come through until ia64_eoi() has been done. 
+ */ + irq_exit(); ++ (void)set_exec_env(get_ve0()); + } + + #ifdef CONFIG_HOTPLUG_CPU +@@ -176,9 +179,11 @@ void ia64_process_pending_intr(void) + ia64_vector vector; + unsigned long saved_tpr; + extern unsigned int vectors_in_migration[NR_IRQS]; ++ struct ve_struct *ve; + + vector = ia64_get_ivr(); + ++ ve = set_exec_env(get_ve0()); + irq_enter(); + saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); + ia64_srlz_d(); +@@ -210,6 +215,7 @@ void ia64_process_pending_intr(void) + vector = ia64_get_ivr(); + } + irq_exit(); ++ (void)set_exec_env(ve); + } + #endif + +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/mca.c linux-2.6.15-ve025stab014/arch/ia64/kernel/mca.c +--- linux-2.6.15.orig/arch/ia64/kernel/mca.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/kernel/mca.c 2006-01-27 14:48:07.000000000 +0300 +@@ -1241,10 +1241,10 @@ default_monarch_init_process(struct noti + } + printk("\n\n"); + if (read_trylock(&tasklist_lock)) { +- do_each_thread (g, t) { ++ do_each_thread_all (g, t) { + printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); + show_stack(t, NULL); +- } while_each_thread (g, t); ++ } while_each_thread_all (g, t); + read_unlock(&tasklist_lock); + } + return NOTIFY_DONE; +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/perfmon.c linux-2.6.15-ve025stab014/arch/ia64/kernel/perfmon.c +--- linux-2.6.15.orig/arch/ia64/kernel/perfmon.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/kernel/perfmon.c 2006-01-27 14:48:07.000000000 +0300 +@@ -2620,7 +2620,7 @@ pfm_get_task(pfm_context_t *ctx, pid_t p + + read_lock(&tasklist_lock); + +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + + /* make sure task cannot go away while we operate on it */ + if (p) get_task_struct(p); +@@ -4184,12 +4184,12 @@ pfm_check_task_exist(pfm_context_t *ctx) + + read_lock(&tasklist_lock); + +- do_each_thread (g, t) { ++ do_each_thread_ve (g, t) { + if (t->thread.pfm_context == ctx) { + ret = 0; + break; + } +- } while_each_thread (g, t); ++ } while_each_thread_ve (g, t); + + read_unlock(&tasklist_lock); + +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/process.c linux-2.6.15-ve025stab014/arch/ia64/kernel/process.c +--- linux-2.6.15.orig/arch/ia64/kernel/process.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/kernel/process.c 2006-01-27 14:48:07.000000000 +0300 +@@ -681,6 +681,13 @@ kernel_thread (int (*fn)(void *), void * + struct pt_regs pt; + } regs; + ++ /* Don't allow kernel_thread() inside VE */ ++ if (!ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ + memset(®s, 0, sizeof(regs)); + regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ + regs.pt.r1 = helper_fptr[1]; /* set GP */ +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/ptrace.c linux-2.6.15-ve025stab014/arch/ia64/kernel/ptrace.c +--- linux-2.6.15.orig/arch/ia64/kernel/ptrace.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/kernel/ptrace.c 2006-01-27 14:48:07.000000000 +0300 +@@ -1440,7 +1440,7 @@ sys_ptrace (long request, pid_t pid, uns + ret = -ESRCH; + read_lock(&tasklist_lock); + { +- child = find_task_by_pid(pid); ++ child = find_task_by_pid_ve(pid); + if (child) { + if (peek_or_poke) + child = find_thread_for_addr(child, addr); +diff -uprN linux-2.6.15.orig/arch/ia64/kernel/signal.c linux-2.6.15-ve025stab014/arch/ia64/kernel/signal.c +--- linux-2.6.15.orig/arch/ia64/kernel/signal.c 2006-01-03 06:21:10.000000000 +0300 ++++ 
linux-2.6.15-ve025stab014/arch/ia64/kernel/signal.c 2006-01-27 14:48:07.000000000 +0300 +@@ -270,7 +270,7 @@ ia64_rt_sigreturn (struct sigscratch *sc + si.si_signo = SIGSEGV; + si.si_errno = 0; + si.si_code = SI_KERNEL; +- si.si_pid = current->pid; ++ si.si_pid = virt_pid(current); + si.si_uid = current->uid; + si.si_addr = sc; + force_sig_info(SIGSEGV, &si, current); +@@ -375,7 +375,7 @@ force_sigsegv_info (int sig, void __user + si.si_signo = SIGSEGV; + si.si_errno = 0; + si.si_code = SI_KERNEL; +- si.si_pid = current->pid; ++ si.si_pid = virt_pid(current); + si.si_uid = current->uid; + si.si_addr = addr; + force_sig_info(SIGSEGV, &si, current); +@@ -641,7 +641,7 @@ set_sigdelayed(pid_t pid, int signo, int + for (i = 1; i <= 3; ++i) { + switch (i) { + case 1: +- t = find_task_by_pid(pid); ++ t = find_task_by_pid_ve(pid); + if (t) + start_time = start_time_ul(t); + break; +@@ -682,7 +682,7 @@ do_sigdelayed(void) + siginfo.si_code = current_thread_info()->sigdelayed.code; + siginfo.si_addr = current_thread_info()->sigdelayed.addr; + pid = current_thread_info()->sigdelayed.pid; +- t = find_task_by_pid(pid); ++ t = find_task_by_pid_ve(pid); + if (!t) + return; + if (current_thread_info()->sigdelayed.start_time != start_time_ul(t)) +diff -uprN linux-2.6.15.orig/arch/ia64/mm/contig.c linux-2.6.15-ve025stab014/arch/ia64/mm/contig.c +--- linux-2.6.15.orig/arch/ia64/mm/contig.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/mm/contig.c 2006-01-27 14:48:07.000000000 +0300 +@@ -64,6 +64,7 @@ show_mem (void) + printk("%ld pages in page table cache\n", + pgtable_quicklist_total_size()); + } ++EXPORT_SYMBOL(show_mem); + + /* physical address where the bootmem map is located */ + unsigned long bootmap_start; +diff -uprN linux-2.6.15.orig/arch/ia64/mm/discontig.c linux-2.6.15-ve025stab014/arch/ia64/mm/discontig.c +--- linux-2.6.15.orig/arch/ia64/mm/discontig.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/mm/discontig.c 2006-01-27 14:48:07.000000000 +0300 +@@ -594,6 +594,7 @@ void show_mem(void) + pgtable_quicklist_total_size()); + printk("%d free buffer pages\n", nr_free_buffer_pages()); + } ++EXPORT_SYMBOL(show_mem); + + /** + * call_pernode_memory - use SRAT to call callback functions with node info +diff -uprN linux-2.6.15.orig/arch/ia64/mm/fault.c linux-2.6.15-ve025stab014/arch/ia64/mm/fault.c +--- linux-2.6.15.orig/arch/ia64/mm/fault.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/mm/fault.c 2006-01-27 14:48:05.000000000 +0300 +@@ -116,7 +116,6 @@ ia64_do_page_fault (unsigned long addres + if ((vma->vm_flags & mask) != mask) + goto bad_area; + +- survive: + /* + * If for any reason at all we couldn't handle the fault, make + * sure we exit gracefully rather than endlessly redo the +@@ -241,13 +240,13 @@ ia64_do_page_fault (unsigned long addres + + out_of_memory: + up_read(&mm->mmap_sem); +- if (current->pid == 1) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } +- printk(KERN_CRIT "VM: killing process %s\n", current->comm); +- if (user_mode(regs)) +- do_exit(SIGKILL); ++ if (user_mode(regs)) { ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. 
++ */ ++ force_sig(SIGKILL, current); ++ return; ++ } + goto no_context; + } +diff -uprN linux-2.6.15.orig/arch/ia64/mm/init.c linux-2.6.15-ve025stab014/arch/ia64/mm/init.c +--- linux-2.6.15.orig/arch/ia64/mm/init.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ia64/mm/init.c 2006-01-27 14:48:05.000000000 +0300 +@@ -37,6 +37,8 @@ + #include <asm/unistd.h> + #include <asm/mca.h> + ++#include <ub/ub_vmpages.h> ++ + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + + DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist); +@@ -96,7 +98,7 @@ check_pgt_cache(void) + preempt_disable(); + while (unlikely((pages_to_free = min_pages_to_free()) > 0)) { + while (pages_to_free--) { +- free_page((unsigned long)pgtable_quicklist_alloc()); ++ free_page((unsigned long)pgtable_quicklist_alloc(0)); + } + preempt_enable(); + preempt_disable(); +@@ -146,6 +148,10 @@ ia64_init_addr_space (void) + + ia64_set_rbs_bot(); + ++ if (ub_memory_charge(current->mm, PAGE_SIZE, VM_DATA_DEFAULT_FLAGS, ++ NULL, UB_SOFT)) ++ goto skip; ++ + /* + * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore + * the problem. When the process attempts to write to the register backing store +@@ -166,8 +172,11 @@ ia64_init_addr_space (void) + return; + } + up_write(¤t->mm->mmap_sem); +- } ++ } else ++ ub_memory_uncharge(current->mm, PAGE_SIZE, ++ VM_DATA_DEFAULT_FLAGS, NULL); + ++skip: + /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ + if (!(current->personality & MMAP_PAGE_ZERO)) { + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); +diff -uprN linux-2.6.15.orig/arch/powerpc/Kconfig linux-2.6.15-ve025stab014/arch/powerpc/Kconfig +--- linux-2.6.15.orig/arch/powerpc/Kconfig 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/Kconfig 2006-01-27 14:48:07.000000000 +0300 +@@ -912,6 +912,8 @@ source "arch/powerpc/platforms/iseries/K + + source "lib/Kconfig" + ++source "kernel/ub/Kconfig" ++ + menu "Instrumentation Support" + depends on EXPERIMENTAL + +@@ -930,6 +932,8 @@ endmenu + + source "arch/powerpc/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + config KEYS_COMPAT +diff -uprN linux-2.6.15.orig/arch/powerpc/kernel/irq.c linux-2.6.15-ve025stab014/arch/powerpc/kernel/irq.c +--- linux-2.6.15.orig/arch/powerpc/kernel/irq.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/kernel/irq.c 2006-01-27 14:48:07.000000000 +0300 +@@ -57,6 +57,8 @@ + #include <linux/kallsyms.h> + #endif + ++#include <ub/beancounter.h> ++ + #include <asm/uaccess.h> + #include <asm/system.h> + #include <asm/io.h> +@@ -199,7 +201,11 @@ void fixup_irqs(cpumask_t map) + void do_IRQ(struct pt_regs *regs) + { + struct paca_struct *lpaca; ++ struct ve_struct *ve; ++ struct user_beancounter *ub; + ++ ve = set_exec_env(get_ve0()); ++ ub = set_exec_ub(get_ub0()); + irq_enter(); + + #ifdef CONFIG_DEBUG_STACKOVERFLOW +@@ -228,6 +234,8 @@ void do_IRQ(struct pt_regs *regs) + process_hvlpevents(regs); + + irq_exit(); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); + + if (lpaca->lppaca.int_dword.fields.decr_int) { + lpaca->lppaca.int_dword.fields.decr_int = 0; +@@ -244,7 +252,11 @@ void do_IRQ(struct pt_regs *regs) + #ifdef CONFIG_IRQSTACKS + struct thread_info *curtp, *irqtp; + #endif ++ struct ve_struct *ve; ++ struct user_beancounter *ub; + ++ ve = set_exec_env(get_ve0()); ++ ub = set_exec_ub(get_ub0()); + irq_enter(); + + #ifdef CONFIG_DEBUG_STACKOVERFLOW +@@ -293,6 +305,8 @@ void do_IRQ(struct 
pt_regs *regs) + /* That's not SMP safe ... but who cares ? */ + ppc_spurious_interrupts++; + irq_exit(); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); + } + + #endif /* CONFIG_PPC_ISERIES */ +diff -uprN linux-2.6.15.orig/arch/powerpc/kernel/misc_32.S linux-2.6.15-ve025stab014/arch/powerpc/kernel/misc_32.S +--- linux-2.6.15.orig/arch/powerpc/kernel/misc_32.S 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/kernel/misc_32.S 2006-01-27 14:48:07.000000000 +0300 +@@ -967,7 +967,7 @@ _GLOBAL(_get_SP) + * Create a kernel thread + * kernel_thread(fn, arg, flags) + */ +-_GLOBAL(kernel_thread) ++_GLOBAL(ppc_kernel_thread) + stwu r1,-16(r1) + stw r30,8(r1) + stw r31,12(r1) +diff -uprN linux-2.6.15.orig/arch/powerpc/kernel/misc_64.S linux-2.6.15-ve025stab014/arch/powerpc/kernel/misc_64.S +--- linux-2.6.15.orig/arch/powerpc/kernel/misc_64.S 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/kernel/misc_64.S 2006-01-27 14:48:07.000000000 +0300 +@@ -677,7 +677,7 @@ _GLOBAL(scom970_write) + * Create a kernel thread + * kernel_thread(fn, arg, flags) + */ +-_GLOBAL(kernel_thread) ++_GLOBAL(ppc_kernel_thread) + std r29,-24(r1) + std r30,-16(r1) + stdu r1,-STACK_FRAME_OVERHEAD(r1) +diff -uprN linux-2.6.15.orig/arch/powerpc/kernel/process.c linux-2.6.15-ve025stab014/arch/powerpc/kernel/process.c +--- linux-2.6.15.orig/arch/powerpc/kernel/process.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/kernel/process.c 2006-01-27 14:48:07.000000000 +0300 +@@ -888,3 +888,17 @@ void dump_stack(void) + show_stack(current, NULL); + } + EXPORT_SYMBOL(dump_stack); ++ ++long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) ++{ ++ extern long ppc_kernel_thread(int (*fn)(void *), void *arg, ++ unsigned long flags); ++ ++ if (!ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ ++ return ppc_kernel_thread(fn, arg, flags); ++} +diff -uprN linux-2.6.15.orig/arch/powerpc/kernel/ptrace32.c linux-2.6.15-ve025stab014/arch/powerpc/kernel/ptrace32.c +--- linux-2.6.15.orig/arch/powerpc/kernel/ptrace32.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/kernel/ptrace32.c 2006-01-27 14:48:07.000000000 +0300 +@@ -62,7 +62,7 @@ long compat_sys_ptrace(int request, int + } + ret = -ESRCH; + read_lock(&tasklist_lock); +- child = find_task_by_pid(pid); ++ child = find_task_by_pid_ve(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); +diff -uprN linux-2.6.15.orig/arch/powerpc/kernel/syscalls.c linux-2.6.15-ve025stab014/arch/powerpc/kernel/syscalls.c +--- linux-2.6.15.orig/arch/powerpc/kernel/syscalls.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/kernel/syscalls.c 2006-01-27 14:48:07.000000000 +0300 +@@ -262,7 +262,7 @@ long ppc_newuname(struct new_utsname __u + int err = 0; + + down_read(&uts_sem); +- if (copy_to_user(name, &system_utsname, sizeof(*name))) ++ if (copy_to_user(name, &ve_utsname, sizeof(*name))) + err = -EFAULT; + up_read(&uts_sem); + if (!err) +@@ -275,7 +275,7 @@ int sys_uname(struct old_utsname __user + int err = 0; + + down_read(&uts_sem); +- if (copy_to_user(name, &system_utsname, sizeof(*name))) ++ if (copy_to_user(name, &ve_utsname, sizeof(*name))) + err = -EFAULT; + up_read(&uts_sem); + if (!err) +@@ -291,19 +291,19 @@ int sys_olduname(struct oldold_utsname _ + return -EFAULT; + + down_read(&uts_sem); +- error = __copy_to_user(&name->sysname, 
&system_utsname.sysname, ++ error = __copy_to_user(&name->sysname, &ve_utsname.sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); +- error |= __copy_to_user(&name->nodename, &system_utsname.nodename, ++ error |= __copy_to_user(&name->nodename, &ve_utsname.nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); +- error |= __copy_to_user(&name->release, &system_utsname.release, ++ error |= __copy_to_user(&name->release, &ve_utsname.release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); +- error |= __copy_to_user(&name->version, &system_utsname.version, ++ error |= __copy_to_user(&name->version, &ve_utsname.version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); +- error |= __copy_to_user(&name->machine, &system_utsname.machine, ++ error |= __copy_to_user(&name->machine, &ve_utsname.machine, + __OLD_UTS_LEN); + error |= override_machine(name->machine); + up_read(&uts_sem); +diff -uprN linux-2.6.15.orig/arch/powerpc/kernel/systbl.S linux-2.6.15-ve025stab014/arch/powerpc/kernel/systbl.S +--- linux-2.6.15.orig/arch/powerpc/kernel/systbl.S 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/kernel/systbl.S 2006-01-27 14:48:05.000000000 +0300 +@@ -319,3 +319,12 @@ COMPAT_SYS(ioprio_get) + SYSCALL(inotify_init) + SYSCALL(inotify_add_watch) + SYSCALL(inotify_rm_watch) ++ ++.rept 410 - (. - sys_call_table)/8 ++SYSX(sys_ni_syscall, sys_ni_syscall, sys_ni_syscall) ++.endr ++ ++SYSX(sys_getluid, sys_ni_syscall, sys_getluid) ++SYSX(sys_setluid, sys_ni_syscall, sys_setluid) ++SYSX(sys_setublimit, sys_ni_syscall, sys_setublimit) ++SYSX(sys_ubstat, sys_ni_syscall, sys_ubstat) +diff -uprN linux-2.6.15.orig/arch/powerpc/kernel/time.c linux-2.6.15-ve025stab014/arch/powerpc/kernel/time.c +--- linux-2.6.15.orig/arch/powerpc/kernel/time.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/kernel/time.c 2006-01-27 14:48:07.000000000 +0300 +@@ -420,12 +420,14 @@ void timer_interrupt(struct pt_regs * re + int next_dec; + int cpu = smp_processor_id(); + unsigned long ticks; ++ struct ve_struct *ve; + + #ifdef CONFIG_PPC32 + if (atomic_read(&ppc_n_lost_interrupts) != 0) + do_IRQ(regs); + #endif + ++ ve = set_exec_env(get_ve0()); + irq_enter(); + + profile_tick(CPU_PROFILING, regs); +@@ -488,6 +490,7 @@ void timer_interrupt(struct pt_regs * re + #endif + + irq_exit(); ++ (void)set_exec_env(ve); + } + + void wakeup_decrementer(void) +diff -uprN linux-2.6.15.orig/arch/powerpc/mm/fault.c linux-2.6.15-ve025stab014/arch/powerpc/mm/fault.c +--- linux-2.6.15.orig/arch/powerpc/mm/fault.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/mm/fault.c 2006-01-27 14:48:05.000000000 +0300 +@@ -306,7 +306,6 @@ good_area: + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ +- survive: + switch (handle_mm_fault(mm, vma, address, is_write)) { + + case VM_FAULT_MINOR: +@@ -350,14 +349,12 @@ bad_area_nosemaphore: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (current->pid == 1) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } +- printk("VM: killing process %s\n", current->comm); + if (user_mode(regs)) +- do_exit(SIGKILL); ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. 
Den ++ */ ++ force_sig(SIGKILL, current); + return SIGKILL; + + do_sigbus: +diff -uprN linux-2.6.15.orig/arch/powerpc/mm/init_64.c linux-2.6.15-ve025stab014/arch/powerpc/mm/init_64.c +--- linux-2.6.15.orig/arch/powerpc/mm/init_64.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/mm/init_64.c 2006-01-27 14:48:05.000000000 +0300 +@@ -225,7 +225,8 @@ void pgtable_cache_init(void) + pgtable_cache[i] = kmem_cache_create(name, + size, size, + SLAB_HWCACHE_ALIGN | +- SLAB_MUST_HWCACHE_ALIGN, ++ SLAB_MUST_HWCACHE_ALIGN | ++ SLAB_UBC | SLAB_NO_CHARGE, + zero_ctor, + NULL); + if (! pgtable_cache[i]) +diff -uprN linux-2.6.15.orig/arch/powerpc/mm/mem.c linux-2.6.15-ve025stab014/arch/powerpc/mm/mem.c +--- linux-2.6.15.orig/arch/powerpc/mm/mem.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/mm/mem.c 2006-01-27 14:48:07.000000000 +0300 +@@ -223,6 +223,7 @@ void show_mem(void) + printk("%ld pages shared\n", shared); + printk("%ld pages swap cached\n", cached); + } ++EXPORT_SYMBOL(show_mem); + + /* + * Initialize the bootmem system and give it all the memory we +diff -uprN linux-2.6.15.orig/arch/powerpc/mm/pgtable_32.c linux-2.6.15-ve025stab014/arch/powerpc/mm/pgtable_32.c +--- linux-2.6.15.orig/arch/powerpc/mm/pgtable_32.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/powerpc/mm/pgtable_32.c 2006-01-27 14:48:05.000000000 +0300 +@@ -84,7 +84,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + { + pgd_t *ret; + +- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); ++ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | ++ __GFP_ZERO, PGDIR_ORDER); + return ret; + } + +@@ -118,6 +119,7 @@ struct page *pte_alloc_one(struct mm_str + #else + gfp_t flags = GFP_KERNEL | __GFP_REPEAT; + #endif ++ flags |= (__GFP_UBC | __GFP_SOFT_UBC); + + ptepage = alloc_pages(flags, 0); + if (ptepage) +diff -uprN linux-2.6.15.orig/arch/ppc/Kconfig linux-2.6.15-ve025stab014/arch/ppc/Kconfig +--- linux-2.6.15.orig/arch/ppc/Kconfig 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ppc/Kconfig 2006-01-27 14:48:07.000000000 +0300 +@@ -1422,6 +1422,10 @@ source "arch/powerpc/oprofile/Kconfig" + + source "arch/ppc/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + ++source "kernel/ub/Kconfig" ++ + source "crypto/Kconfig" +diff -uprN linux-2.6.15.orig/arch/ppc/kernel/misc.S linux-2.6.15-ve025stab014/arch/ppc/kernel/misc.S +--- linux-2.6.15.orig/arch/ppc/kernel/misc.S 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ppc/kernel/misc.S 2006-01-27 14:48:07.000000000 +0300 +@@ -1076,7 +1076,7 @@ _GLOBAL(_get_SP) + * Create a kernel thread + * kernel_thread(fn, arg, flags) + */ +-_GLOBAL(kernel_thread) ++_GLOBAL(ppc_kernel_thread) + stwu r1,-16(r1) + stw r30,8(r1) + stw r31,12(r1) +@@ -1403,3 +1403,12 @@ _GLOBAL(sys_call_table) + .long sys_inotify_init /* 275 */ + .long sys_inotify_add_watch + .long sys_inotify_rm_watch ++ ++ .rept 410-(.-sys_call_table)/4 ++ .long sys_ni_syscall ++ .endr ++ ++ .long sys_getluid /* 410 */ ++ .long sys_setluid ++ .long sys_setublimit ++ .long sys_ubstat +diff -uprN linux-2.6.15.orig/arch/ppc/kernel/process.c linux-2.6.15-ve025stab014/arch/ppc/kernel/process.c +--- linux-2.6.15.orig/arch/ppc/kernel/process.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ppc/kernel/process.c 2006-01-27 14:48:07.000000000 +0300 +@@ -849,3 +849,17 @@ unsigned long get_wchan(struct task_stru + } while (count++ < 16); 
+ return 0; + } ++ ++long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) ++{ ++ extern long ppc_kernel_thread(int (*fn)(void *), void *arg, ++ unsigned long flags); ++ ++ if (!ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ ++ return ppc_kernel_thread(fn, arg, flags); ++} +diff -uprN linux-2.6.15.orig/arch/ppc/kernel/time.c linux-2.6.15-ve025stab014/arch/ppc/kernel/time.c +--- linux-2.6.15.orig/arch/ppc/kernel/time.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ppc/kernel/time.c 2006-01-27 14:48:07.000000000 +0300 +@@ -58,6 +58,8 @@ + #include <linux/init.h> + #include <linux/profile.h> + ++#include <ub/beancounter.h> ++ + #include <asm/io.h> + #include <asm/nvram.h> + #include <asm/cache.h> +@@ -136,10 +138,14 @@ void timer_interrupt(struct pt_regs * re + unsigned long cpu = smp_processor_id(); + unsigned jiffy_stamp = last_jiffy_stamp(cpu); + extern void do_IRQ(struct pt_regs *); ++ struct ve_struct *ve; ++ struct user_beancounter *ub; + + if (atomic_read(&ppc_n_lost_interrupts) != 0) + do_IRQ(regs); + ++ ve = set_exec_env(get_ve0()); ++ ub = set_exec_ub(get_ub0()); + irq_enter(); + + while ((next_dec = tb_ticks_per_jiffy - tb_delta(&jiffy_stamp)) <= 0) { +@@ -192,6 +198,8 @@ void timer_interrupt(struct pt_regs * re + ppc_md.heartbeat(); + + irq_exit(); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); + } + + /* +diff -uprN linux-2.6.15.orig/arch/ppc/mm/fault.c linux-2.6.15-ve025stab014/arch/ppc/mm/fault.c +--- linux-2.6.15.orig/arch/ppc/mm/fault.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ppc/mm/fault.c 2006-01-27 14:48:05.000000000 +0300 +@@ -247,7 +247,6 @@ good_area: + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ +- survive: + switch (handle_mm_fault(mm, vma, address, is_write)) { + case VM_FAULT_MINOR: + current->min_flt++; +@@ -290,14 +289,12 @@ bad_area: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (current->pid == 1) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } +- printk("VM: killing process %s\n", current->comm); + if (user_mode(regs)) +- do_exit(SIGKILL); ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. 
Den ++ */ ++ force_sig(SIGKILL, current); + return SIGKILL; + + do_sigbus: +diff -uprN linux-2.6.15.orig/arch/ppc/mm/init.c linux-2.6.15-ve025stab014/arch/ppc/mm/init.c +--- linux-2.6.15.orig/arch/ppc/mm/init.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ppc/mm/init.c 2006-01-27 14:48:07.000000000 +0300 +@@ -136,6 +136,7 @@ void show_mem(void) + printk("%d pages shared\n",shared); + printk("%d pages swap cached\n",cached); + } ++EXPORT_SYMBOL(show_mem); + + /* Free up now-unused memory */ + static void free_sec(unsigned long start, unsigned long end, const char *name) +diff -uprN linux-2.6.15.orig/arch/ppc/mm/pgtable.c linux-2.6.15-ve025stab014/arch/ppc/mm/pgtable.c +--- linux-2.6.15.orig/arch/ppc/mm/pgtable.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/ppc/mm/pgtable.c 2006-01-27 14:48:05.000000000 +0300 +@@ -84,7 +84,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + { + pgd_t *ret; + +- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); ++ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | ++ __GFP_ZERO, PGDIR_ORDER); + return ret; + } + +@@ -118,6 +119,7 @@ struct page *pte_alloc_one(struct mm_str + #else + gfp_t flags = GFP_KERNEL | __GFP_REPEAT; + #endif ++ flags |= (__GFP_UBC | __GFP_SOFT_UBC); + + ptepage = alloc_pages(flags, 0); + if (ptepage) +diff -uprN linux-2.6.15.orig/arch/s390/Kconfig linux-2.6.15-ve025stab014/arch/s390/Kconfig +--- linux-2.6.15.orig/arch/s390/Kconfig 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/s390/Kconfig 2006-01-27 14:48:07.000000000 +0300 +@@ -485,8 +485,14 @@ source "arch/s390/oprofile/Kconfig" + + source "arch/s390/Kconfig.debug" + ++menu "OpenVZ" ++source "kernel/Kconfig.openvz" ++endmenu ++ + source "security/Kconfig" + + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "kernel/ub/Kconfig" +diff -uprN linux-2.6.15.orig/arch/s390/kernel/process.c linux-2.6.15-ve025stab014/arch/s390/kernel/process.c +--- linux-2.6.15.orig/arch/s390/kernel/process.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/s390/kernel/process.c 2006-01-27 14:48:07.000000000 +0300 +@@ -154,9 +154,10 @@ void show_regs(struct pt_regs *regs) + struct task_struct *tsk = current; + + printk("CPU: %d %s\n", tsk->thread_info->cpu, print_tainted()); +- printk("Process %s (pid: %d, task: %p, ksp: %p)\n", +- current->comm, current->pid, (void *) tsk, +- (void *) tsk->thread.ksp); ++ printk("Process %s (pid: %d, veid: %d, task: %p, ksp: %p)\n", ++ current->comm, current->pid, ++ VEID(VE_TASK_INFO(current)->owner_env), ++ (void *) tsk, (void *) tsk->thread.ksp); + + show_registers(regs); + /* Show stack backtrace if pt_regs is from kernel mode */ +@@ -177,6 +178,13 @@ int kernel_thread(int (*fn)(void *), voi + { + struct pt_regs regs; + ++ if (!ve_is_super(get_exec_env())) { ++ /* Don't allow kernel_thread() inside VE */ ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ + memset(®s, 0, sizeof(regs)); + regs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | PSW_MASK_EXT; + regs.psw.addr = (unsigned long) kernel_thread_starter | PSW_ADDR_AMODE; +diff -uprN linux-2.6.15.orig/arch/s390/kernel/ptrace.c linux-2.6.15-ve025stab014/arch/s390/kernel/ptrace.c +--- linux-2.6.15.orig/arch/s390/kernel/ptrace.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/s390/kernel/ptrace.c 2006-01-27 14:48:07.000000000 +0300 +@@ -732,7 +732,7 @@ sys_ptrace(long request, long pid, long + + ret = -ESRCH; + 
read_lock(&tasklist_lock); +- child = find_task_by_pid(pid); ++ child = find_task_by_pid_ve(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); +diff -uprN linux-2.6.15.orig/arch/s390/kernel/s390_ext.c linux-2.6.15-ve025stab014/arch/s390/kernel/s390_ext.c +--- linux-2.6.15.orig/arch/s390/kernel/s390_ext.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/s390/kernel/s390_ext.c 2006-01-27 14:48:07.000000000 +0300 +@@ -114,7 +114,9 @@ void do_extint(struct pt_regs *regs, uns + { + ext_int_info_t *p; + int index; ++ struct ve_struct *envid; + ++ envid = set_exec_env(get_ve0()); + irq_enter(); + asm volatile ("mc 0,0"); + if (S390_lowcore.int_clock >= S390_lowcore.jiffy_timer) +@@ -132,6 +134,7 @@ void do_extint(struct pt_regs *regs, uns + } + } + irq_exit(); ++ (void)set_exec_env(envid); + } + + EXPORT_SYMBOL(register_external_interrupt); +diff -uprN linux-2.6.15.orig/arch/s390/kernel/smp.c linux-2.6.15-ve025stab014/arch/s390/kernel/smp.c +--- linux-2.6.15.orig/arch/s390/kernel/smp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/s390/kernel/smp.c 2006-01-27 14:48:07.000000000 +0300 +@@ -533,6 +533,17 @@ int __devinit start_secondary(void *cpuv + { + /* Setup the cpu */ + cpu_init(); ++ ++#ifdef CONFIG_VE ++ /* TSC reset. kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++ /* ++ * Cosmetic: sleep_time won't be changed afterwards for the idle ++ * thread; keep it 0 rather than -cycles. ++ */ ++ VE_TASK_INFO(idle)->sleep_time = 0; ++#endif ++ + preempt_disable(); + /* init per CPU timer */ + init_cpu_timer(); +@@ -802,6 +813,11 @@ void __init smp_prepare_cpus(unsigned in + for_each_cpu(cpu) + if (cpu != smp_processor_id()) + smp_create_idle(cpu); ++ ++#ifdef CONFIG_VE ++ /* TSC reset. 
kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++#endif + } + + void __devinit smp_prepare_boot_cpu(void) +diff -uprN linux-2.6.15.orig/arch/s390/kernel/syscalls.S linux-2.6.15-ve025stab014/arch/s390/kernel/syscalls.S +--- linux-2.6.15.orig/arch/s390/kernel/syscalls.S 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/s390/kernel/syscalls.S 2006-01-27 14:48:05.000000000 +0300 +@@ -295,3 +295,12 @@ SYSCALL(sys_ioprio_get,sys_ioprio_get,sy + SYSCALL(sys_inotify_init,sys_inotify_init,sys_inotify_init) + SYSCALL(sys_inotify_add_watch,sys_inotify_add_watch,sys_inotify_add_watch_wrapper) + SYSCALL(sys_inotify_rm_watch,sys_inotify_rm_watch,sys_inotify_rm_watch_wrapper) ++ ++.rept 410-(.-sys_call_table)/4 ++ NI_SYSCALL ++.endr ++ ++SYSCALL(sys_getluid, sys_getluid, sys_ni_syscall) /* 410 */ ++SYSCALL(sys_setluid, sys_setluid, sys_ni_syscall) ++SYSCALL(sys_setublimit, sys_setublimit, sys_ni_syscall) ++SYSCALL(sys_ubstat, sys_ubstat, sys_ni_syscall) +diff -uprN linux-2.6.15.orig/arch/s390/mm/init.c linux-2.6.15-ve025stab014/arch/s390/mm/init.c +--- linux-2.6.15.orig/arch/s390/mm/init.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/s390/mm/init.c 2006-01-27 14:48:07.000000000 +0300 +@@ -89,6 +89,7 @@ void show_mem(void) + printk("%d pages shared\n",shared); + printk("%d pages swap cached\n",cached); + } ++EXPORT_SYMBOL(show_mem); + + /* References to section boundaries */ + +diff -uprN linux-2.6.15.orig/arch/um/drivers/mconsole_kern.c linux-2.6.15-ve025stab014/arch/um/drivers/mconsole_kern.c +--- linux-2.6.15.orig/arch/um/drivers/mconsole_kern.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/um/drivers/mconsole_kern.c 2006-01-27 14:48:07.000000000 +0300 +@@ -509,7 +509,7 @@ void do_stack(struct mc_request *req) + } + + from = current; +- to = find_task_by_pid(pid_requested); ++ to = find_task_by_pid_all(pid_requested); + + if((to == NULL) || (pid_requested == 0)) { + mconsole_reply(req, "Couldn't find that pid", 1, 0); +diff -uprN linux-2.6.15.orig/arch/um/kernel/skas/process_kern.c linux-2.6.15-ve025stab014/arch/um/kernel/skas/process_kern.c +--- linux-2.6.15.orig/arch/um/kernel/skas/process_kern.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/um/kernel/skas/process_kern.c 2006-01-27 14:48:07.000000000 +0300 +@@ -213,7 +213,7 @@ void kill_off_processes_skas(void) + int pid, me; + + me = os_getpid(); +- for_each_process(p){ ++ for_each_process_all(p){ + if(p->mm == NULL) + continue; + +diff -uprN linux-2.6.15.orig/arch/um/kernel/tt/process_kern.c linux-2.6.15-ve025stab014/arch/um/kernel/tt/process_kern.c +--- linux-2.6.15.orig/arch/um/kernel/tt/process_kern.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/um/kernel/tt/process_kern.c 2006-01-27 14:48:07.000000000 +0300 +@@ -303,7 +303,7 @@ void kill_off_processes_tt(void) + int me; + + me = os_getpid(); +- for_each_process(p){ ++ for_each_process_all(p){ + if(p->thread.mode.tt.extern_pid != me) + os_kill_process(p->thread.mode.tt.extern_pid, 0); + } +@@ -446,7 +446,7 @@ int is_valid_pid(int pid) + struct task_struct *task; + + read_lock(&tasklist_lock); +- for_each_process(task){ ++ for_each_process_all(task){ + if(task->thread.mode.tt.extern_pid == pid){ + read_unlock(&tasklist_lock); + return(1); +diff -uprN linux-2.6.15.orig/arch/x86_64/Kconfig linux-2.6.15-ve025stab014/arch/x86_64/Kconfig +--- linux-2.6.15.orig/arch/x86_64/Kconfig 2006-01-03 06:21:10.000000000 +0300 ++++ 
linux-2.6.15-ve025stab014/arch/x86_64/Kconfig 2006-01-27 14:48:07.000000000 +0300 +@@ -574,8 +574,14 @@ endmenu + + source "arch/x86_64/Kconfig.debug" + ++menu "OpenVZ" ++source "kernel/Kconfig.openvz" ++endmenu ++ + source "security/Kconfig" + + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "kernel/ub/Kconfig" +diff -uprN linux-2.6.15.orig/arch/x86_64/ia32/ia32_aout.c linux-2.6.15-ve025stab014/arch/x86_64/ia32/ia32_aout.c +--- linux-2.6.15.orig/arch/x86_64/ia32/ia32_aout.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/ia32/ia32_aout.c 2006-01-27 14:48:07.000000000 +0300 +@@ -347,14 +347,14 @@ static int load_aout_binary(struct linux + if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && + (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) + { +- printk(KERN_NOTICE "executable not page aligned\n"); ++ ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n"); + error_time2 = jiffies; + } + + if ((fd_offset & ~PAGE_MASK) != 0 && + (jiffies-error_time) > 5*HZ) + { +- printk(KERN_WARNING ++ ve_printk(VE_LOG, KERN_WARNING + "fd_offset is not page aligned. Please convert program: %s\n", + bprm->file->f_dentry->d_name.name); + error_time = jiffies; +@@ -467,7 +467,7 @@ static int load_aout_library(struct file + static unsigned long error_time; + if ((jiffies-error_time) > 5*HZ) + { +- printk(KERN_WARNING ++ ve_printk(VE_LOG, KERN_WARNING + "N_TXTOFF is not page aligned. Please convert library: %s\n", + file->f_dentry->d_name.name); + error_time = jiffies; +diff -uprN linux-2.6.15.orig/arch/x86_64/ia32/ia32_binfmt.c linux-2.6.15-ve025stab014/arch/x86_64/ia32/ia32_binfmt.c +--- linux-2.6.15.orig/arch/x86_64/ia32/ia32_binfmt.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/ia32/ia32_binfmt.c 2006-01-27 14:48:05.000000000 +0300 +@@ -27,6 +27,8 @@ + #include <asm/ia32.h> + #include <asm/vsyscall32.h> + ++#include <ub/ub_vmpages.h> ++ + #define ELF_NAME "elf/i386" + + #define AT_SYSINFO 32 +@@ -350,9 +352,15 @@ int ia32_setup_arg_pages(struct linux_bi + bprm->loader += stack_base; + bprm->exec += stack_base; + ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, IA32_STACK_TOP - ++ (PAGE_MASK & (unsigned long)bprm->p), ++ VM_STACK_FLAGS, NULL, UB_SOFT)) ++ goto err_charge; ++ + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!mpnt) +- return -ENOMEM; ++ goto err_alloc; + + memset(mpnt, 0, sizeof(*mpnt)); + +@@ -369,11 +377,8 @@ int ia32_setup_arg_pages(struct linux_bi + mpnt->vm_flags = VM_STACK_FLAGS; + mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? 
+ PAGE_COPY_EXEC : PAGE_COPY; +- if ((ret = insert_vm_struct(mm, mpnt))) { +- up_write(&mm->mmap_sem); +- kmem_cache_free(vm_area_cachep, mpnt); +- return ret; +- } ++ if ((ret = insert_vm_struct(mm, mpnt))) ++ goto err_insert; + mm->stack_vm = mm->total_vm = vma_pages(mpnt); + } + +@@ -388,6 +393,16 @@ int ia32_setup_arg_pages(struct linux_bi + up_write(&mm->mmap_sem); + + return 0; ++ ++err_insert: ++ up_write(&mm->mmap_sem); ++ kmem_cache_free(vm_area_cachep, mpnt); ++err_alloc: ++ ub_memory_uncharge(mm, IA32_STACK_TOP - ++ (PAGE_MASK & (unsigned long)bprm->p), ++ VM_STACK_FLAGS, NULL); ++err_charge: ++ return ret; + } + EXPORT_SYMBOL(ia32_setup_arg_pages); + +diff -uprN linux-2.6.15.orig/arch/x86_64/ia32/ptrace32.c linux-2.6.15-ve025stab014/arch/x86_64/ia32/ptrace32.c +--- linux-2.6.15.orig/arch/x86_64/ia32/ptrace32.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/ia32/ptrace32.c 2006-01-27 14:48:07.000000000 +0300 +@@ -206,7 +206,7 @@ static struct task_struct *find_target(i + + *err = -ESRCH; + read_lock(&tasklist_lock); +- child = find_task_by_pid(pid); ++ child = find_task_by_pid_ve(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); +diff -uprN linux-2.6.15.orig/arch/x86_64/ia32/sys_ia32.c linux-2.6.15-ve025stab014/arch/x86_64/ia32/sys_ia32.c +--- linux-2.6.15.orig/arch/x86_64/ia32/sys_ia32.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/ia32/sys_ia32.c 2006-01-27 14:48:07.000000000 +0300 +@@ -505,7 +505,7 @@ int sys32_ni_syscall(int call) + static char lastcomm[sizeof(me->comm)]; + + if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { +- printk(KERN_INFO "IA32 syscall %d from %s not implemented\n", ++ ve_printk(VE_LOG, KERN_INFO "IA32 syscall %d from %s not implemented\n", + call, me->comm); + strncpy(lastcomm, me->comm, sizeof(lastcomm)); + } +@@ -868,13 +868,13 @@ asmlinkage long sys32_olduname(struct ol + + down_read(&uts_sem); + +- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); ++ error = __copy_to_user(&name->sysname,&ve_utsname.sysname,__OLD_UTS_LEN); + __put_user(0,name->sysname+__OLD_UTS_LEN); +- __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); ++ __copy_to_user(&name->nodename,&ve_utsname.nodename,__OLD_UTS_LEN); + __put_user(0,name->nodename+__OLD_UTS_LEN); +- __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); ++ __copy_to_user(&name->release,&ve_utsname.release,__OLD_UTS_LEN); + __put_user(0,name->release+__OLD_UTS_LEN); +- __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); ++ __copy_to_user(&name->version,&ve_utsname.version,__OLD_UTS_LEN); + __put_user(0,name->version+__OLD_UTS_LEN); + { + char *arch = "x86_64"; +@@ -897,7 +897,7 @@ long sys32_uname(struct old_utsname __us + if (!name) + return -EFAULT; + down_read(&uts_sem); +- err=copy_to_user(name, &system_utsname, sizeof (*name)); ++ err=copy_to_user(name, &ve_utsname, sizeof (*name)); + up_read(&uts_sem); + if (personality(current->personality) == PER_LINUX32) + err |= copy_to_user(&name->machine, "i686", 5); +@@ -1002,7 +1002,7 @@ long sys32_vm86_warning(void) + struct task_struct *me = current; + static char lastcomm[sizeof(me->comm)]; + if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { +- printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", ++ ve_printk(VE_LOG, KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", + me->comm); + strncpy(lastcomm, me->comm, sizeof(lastcomm)); + } +diff -uprN 
linux-2.6.15.orig/arch/x86_64/ia32/syscall32.c linux-2.6.15-ve025stab014/arch/x86_64/ia32/syscall32.c +--- linux-2.6.15.orig/arch/x86_64/ia32/syscall32.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/ia32/syscall32.c 2006-01-27 14:48:05.000000000 +0300 +@@ -14,6 +14,8 @@ + #include <asm/tlbflush.h> + #include <asm/ia32_unistd.h> + ++#include <ub/ub_vmpages.h> ++ + extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; + extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; + extern int sysctl_vsyscall32; +@@ -47,32 +49,45 @@ int syscall32_setup_pages(struct linux_b + int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; ++ unsigned long flags; + int ret; + ++ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE | ++ mm->def_flags; ++ ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, VSYSCALL32_END - VSYSCALL32_BASE, ++ flags, NULL, UB_SOFT)) ++ goto err_charge; ++ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) +- return -ENOMEM; ++ goto err_alloc; + + memset(vma, 0, sizeof(struct vm_area_struct)); + /* Could randomize here */ + vma->vm_start = VSYSCALL32_BASE; + vma->vm_end = VSYSCALL32_END; + /* MAYWRITE to allow gdb to COW and set breakpoints */ +- vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE; +- vma->vm_flags |= mm->def_flags; ++ vma->vm_flags = flags; + vma->vm_page_prot = protection_map[vma->vm_flags & 7]; + vma->vm_ops = &syscall32_vm_ops; + vma->vm_mm = mm; + + down_write(&mm->mmap_sem); +- if ((ret = insert_vm_struct(mm, vma))) { +- up_write(&mm->mmap_sem); +- kmem_cache_free(vm_area_cachep, vma); +- return ret; +- } ++ if ((ret = insert_vm_struct(mm, vma))) ++ goto err_ins; + mm->total_vm += npages; + up_write(&mm->mmap_sem); + return 0; ++ ++err_ins: ++ up_write(&mm->mmap_sem); ++ kmem_cache_free(vm_area_cachep, vma); ++err_alloc: ++ ub_memory_uncharge(mm, VSYSCALL32_END - VSYSCALL32_BASE, flags, NULL); ++err_charge: ++ return ret; + } + + static int __init init_syscall32(void) +diff -uprN linux-2.6.15.orig/arch/x86_64/kernel/apic.c linux-2.6.15-ve025stab014/arch/x86_64/kernel/apic.c +--- linux-2.6.15.orig/arch/x86_64/kernel/apic.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/kernel/apic.c 2006-01-27 14:48:07.000000000 +0300 +@@ -905,6 +905,7 @@ void smp_local_timer_interrupt(struct pt + */ + void smp_apic_timer_interrupt(struct pt_regs *regs) + { ++ struct ve_struct *ve; + /* + * the NMI deadlock-detector uses this. + */ +@@ -920,9 +921,11 @@ void smp_apic_timer_interrupt(struct pt_ + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. 
+ */ ++ ve = set_exec_env(get_ve0()); + irq_enter(); + smp_local_timer_interrupt(regs); + irq_exit(); ++ (void)set_exec_env(ve); + } + + /* +diff -uprN linux-2.6.15.orig/arch/x86_64/kernel/entry.S linux-2.6.15-ve025stab014/arch/x86_64/kernel/entry.S +--- linux-2.6.15.orig/arch/x86_64/kernel/entry.S 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/kernel/entry.S 2006-01-27 14:48:07.000000000 +0300 +@@ -828,7 +828,7 @@ ENTRY(kernel_thread) + xorl %r9d,%r9d + + # clone now +- call do_fork ++ call do_fork_kthread + movq %rax,RAX(%rsp) + xorl %edi,%edi + +diff -uprN linux-2.6.15.orig/arch/x86_64/kernel/irq.c linux-2.6.15-ve025stab014/arch/x86_64/kernel/irq.c +--- linux-2.6.15.orig/arch/x86_64/kernel/irq.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/kernel/irq.c 2006-01-27 14:48:07.000000000 +0300 +@@ -97,12 +97,14 @@ asmlinkage unsigned int do_IRQ(struct pt + { + /* high bits used in ret_from_ code */ + unsigned irq = regs->orig_rax & 0xff; +- ++ struct ve_struct *ve; ++ ++ ve = set_exec_env(get_ve0()); + irq_enter(); + + __do_IRQ(irq, regs); + irq_exit(); +- ++ (void)set_exec_env(ve); + return 1; + } + +diff -uprN linux-2.6.15.orig/arch/x86_64/kernel/ldt.c linux-2.6.15-ve025stab014/arch/x86_64/kernel/ldt.c +--- linux-2.6.15.orig/arch/x86_64/kernel/ldt.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/kernel/ldt.c 2006-01-27 14:48:05.000000000 +0300 +@@ -23,6 +23,8 @@ + #include <asm/desc.h> + #include <asm/proto.h> + ++#include <ub/ub_mem.h> ++ + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ + static void flush_ldt(void *null) + { +@@ -42,9 +44,9 @@ static int alloc_ldt(mm_context_t *pc, u + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) +- newldt = vmalloc(mincount*LDT_ENTRY_SIZE); ++ newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); + else +- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); ++ newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; +diff -uprN linux-2.6.15.orig/arch/x86_64/kernel/process.c linux-2.6.15-ve025stab014/arch/x86_64/kernel/process.c +--- linux-2.6.15.orig/arch/x86_64/kernel/process.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/kernel/process.c 2006-01-27 14:48:07.000000000 +0300 +@@ -812,3 +812,20 @@ unsigned long arch_align_stack(unsigned + sp -= get_random_int() % 8192; + return sp & ~0xf; + } ++ ++long do_fork_kthread(unsigned long clone_flags, ++ unsigned long stack_start, ++ struct pt_regs *regs, ++ unsigned long stack_size, ++ int __user *parent_tidptr, ++ int __user *child_tidptr) ++{ ++ if (ve_is_super(get_exec_env())) ++ return do_fork(clone_flags, stack_start, regs, stack_size, ++ parent_tidptr, child_tidptr); ++ ++ /* Don't allow kernel_thread() inside VE */ ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++} +diff -uprN linux-2.6.15.orig/arch/x86_64/kernel/sys_x86_64.c linux-2.6.15-ve025stab014/arch/x86_64/kernel/sys_x86_64.c +--- linux-2.6.15.orig/arch/x86_64/kernel/sys_x86_64.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/kernel/sys_x86_64.c 2006-01-27 14:48:07.000000000 +0300 +@@ -148,7 +148,7 @@ asmlinkage long sys_uname(struct new_uts + { + int err; + down_read(&uts_sem); +- err = copy_to_user(name, &system_utsname, sizeof (*name)); ++ err = copy_to_user(name, &ve_utsname, sizeof (*name)); + up_read(&uts_sem); + if (personality(current->personality) 
== PER_LINUX32) + err |= copy_to_user(&name->machine, "i686", 5); +diff -uprN linux-2.6.15.orig/arch/x86_64/kernel/time.c linux-2.6.15-ve025stab014/arch/x86_64/kernel/time.c +--- linux-2.6.15.orig/arch/x86_64/kernel/time.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/kernel/time.c 2006-01-27 14:48:07.000000000 +0300 +@@ -64,6 +64,8 @@ unsigned long vxtime_hz = PIT_TICK_RATE; + int report_lost_ticks; /* command line option */ + unsigned long long monotonic_base; + ++EXPORT_SYMBOL(cpu_khz); ++ + struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ + + volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +diff -uprN linux-2.6.15.orig/arch/x86_64/kernel/traps.c linux-2.6.15-ve025stab014/arch/x86_64/kernel/traps.c +--- linux-2.6.15.orig/arch/x86_64/kernel/traps.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/kernel/traps.c 2006-01-27 14:48:07.000000000 +0300 +@@ -279,10 +279,12 @@ void show_registers(struct pt_regs *regs + + rsp = regs->rsp; + +- printk("CPU %d ", cpu); ++ printk("CPU: %d ", cpu); + __show_regs(regs); +- printk("Process %s (pid: %d, threadinfo %p, task %p)\n", +- cur->comm, cur->pid, cur->thread_info, cur); ++ printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n", ++ cur->comm, cur->pid, ++ VEID(VE_TASK_INFO(current)->owner_env), ++ cur->thread_info, cur); + + /* + * When in-kernel, we also print out the stack and code at the +diff -uprN linux-2.6.15.orig/arch/x86_64/mm/fault.c linux-2.6.15-ve025stab014/arch/x86_64/mm/fault.c +--- linux-2.6.15.orig/arch/x86_64/mm/fault.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/mm/fault.c 2006-01-27 14:48:07.000000000 +0300 +@@ -318,7 +318,7 @@ asmlinkage void __kprobes do_page_fault( + local_irq_enable(); + + if (unlikely(page_fault_trace)) +- printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", ++ ve_printk(VE_LOG, "pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); + + tsk = current; +@@ -364,7 +364,6 @@ asmlinkage void __kprobes do_page_fault( + if (unlikely(in_atomic() || !mm)) + goto bad_area_nosemaphore; + +- again: + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunatly, in the case of an +@@ -468,7 +467,7 @@ bad_area_nosemaphore: + return; + + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { +- printk( ++ ve_printk(VE_LOG, + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->rip, +@@ -533,13 +532,14 @@ no_context: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (current->pid == 1) { +- yield(); +- goto again; +- } +- printk("VM: killing process %s\n", tsk->comm); +- if (error_code & 4) +- do_exit(SIGKILL); ++ if (error_code & 4) { ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. 
++ */ ++ force_sig(SIGKILL, tsk); ++ return; ++ } + goto no_context; + + do_sigbus: +diff -uprN linux-2.6.15.orig/arch/x86_64/mm/init.c linux-2.6.15-ve025stab014/arch/x86_64/mm/init.c +--- linux-2.6.15.orig/arch/x86_64/mm/init.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/arch/x86_64/mm/init.c 2006-01-27 14:48:07.000000000 +0300 +@@ -81,6 +81,7 @@ void show_mem(void) + printk(KERN_INFO "%lu pages shared\n",shared); + printk(KERN_INFO "%lu pages swap cached\n",cached); + } ++EXPORT_SYMBOL(show_mem); + + /* References to section boundaries */ + +diff -uprN linux-2.6.15.orig/block/elevator.c linux-2.6.15-ve025stab014/block/elevator.c +--- linux-2.6.15.orig/block/elevator.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/block/elevator.c 2006-01-27 14:48:07.000000000 +0300 +@@ -646,7 +646,7 @@ void elv_unregister(struct elevator_type + * Iterate every thread in the process to remove the io contexts. + */ + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + struct io_context *ioc = p->io_context; + if (ioc && ioc->cic) { + ioc->cic->exit(ioc->cic); +@@ -658,7 +658,7 @@ void elv_unregister(struct elevator_type + ioc->aic->dtor(ioc->aic); + ioc->aic = NULL; + } +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + + spin_lock_irq(&elv_list_lock); +diff -uprN linux-2.6.15.orig/block/genhd.c linux-2.6.15-ve025stab014/block/genhd.c +--- linux-2.6.15.orig/block/genhd.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/block/genhd.c 2006-01-27 14:48:07.000000000 +0300 +@@ -18,7 +18,7 @@ + + #define MAX_PROBE_HASH 255 /* random */ + +-static struct subsystem block_subsys; ++struct subsystem block_subsys; + + static DECLARE_MUTEX(block_subsys_sem); + +@@ -526,7 +526,8 @@ static struct kset_hotplug_ops block_hot + }; + + /* declare block_subsys. 
*/ +-static decl_subsys(block, &ktype_block, &block_hotplug_ops); ++decl_subsys(block, &ktype_block, &block_hotplug_ops); ++EXPORT_SYMBOL(block_subsys); + + + /* +diff -uprN linux-2.6.15.orig/drivers/base/class.c linux-2.6.15-ve025stab014/drivers/base/class.c +--- linux-2.6.15.orig/drivers/base/class.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/base/class.c 2006-01-27 14:48:07.000000000 +0300 +@@ -74,6 +74,11 @@ static struct kobj_type ktype_class = { + /* Hotplug events for classes go to the class_obj subsys */ + static decl_subsys(class, &ktype_class, NULL); + ++#ifndef CONFIG_VE ++#define visible_class_subsys class_subsys ++#else ++#define visible_class_subsys (*get_exec_env()->class_subsys) ++#endif + + int class_create_file(struct class * cls, const struct class_attribute * attr) + { +@@ -148,7 +153,7 @@ int class_register(struct class * cls) + if (error) + return error; + +- subsys_set_kset(cls, class_subsys); ++ subsys_set_kset(cls, visible_class_subsys); + + error = subsystem_register(&cls->subsys); + if (!error) { +@@ -422,6 +427,11 @@ static struct kset_hotplug_ops class_hot + + static decl_subsys(class_obj, &ktype_class_device, &class_hotplug_ops); + ++#ifndef CONFIG_VE ++#define visible_class_obj_subsys class_obj_subsys ++#else ++#define visible_class_obj_subsys (*get_exec_env()->class_obj_subsys) ++#endif + + static int class_device_add_attrs(struct class_device * cd) + { +@@ -470,7 +480,7 @@ static ssize_t store_uevent(struct class + + void class_device_initialize(struct class_device *class_dev) + { +- kobj_set_kset_s(class_dev, class_obj_subsys); ++ kobj_set_kset_s(class_dev, visible_class_obj_subsys); + kobject_init(&class_dev->kobj); + INIT_LIST_HEAD(&class_dev->node); + } +@@ -805,12 +815,19 @@ void class_interface_unregister(struct c + class_put(parent); + } + +- ++void prepare_sysfs_classes(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->class_subsys = &class_subsys; ++ get_ve0()->class_obj_subsys = &class_obj_subsys; ++#endif ++} + + int __init classes_init(void) + { + int retval; + ++ prepare_sysfs_classes(); + retval = subsystem_register(&class_subsys); + if (retval) + return retval; +@@ -848,3 +865,6 @@ EXPORT_SYMBOL_GPL(class_device_remove_bi + + EXPORT_SYMBOL_GPL(class_interface_register); + EXPORT_SYMBOL_GPL(class_interface_unregister); ++ ++EXPORT_SYMBOL(class_subsys); ++EXPORT_SYMBOL(class_obj_subsys); +diff -uprN linux-2.6.15.orig/drivers/char/pty.c linux-2.6.15-ve025stab014/drivers/char/pty.c +--- linux-2.6.15.orig/drivers/char/pty.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/char/pty.c 2006-01-27 14:48:07.000000000 +0300 +@@ -32,16 +32,30 @@ + #include <linux/bitops.h> + #include <linux/devpts_fs.h> + ++#include <ub/ub_misc.h> ++ + /* These are global because they are accessed in tty_io.c */ + #ifdef CONFIG_UNIX98_PTYS + struct tty_driver *ptm_driver; +-static struct tty_driver *pts_driver; ++struct tty_driver *pts_driver; ++EXPORT_SYMBOL(ptm_driver); ++EXPORT_SYMBOL(pts_driver); ++ ++void prepare_pty(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->ptm_driver = ptm_driver; ++ /* don't clean ptm_driver and co. 
here, they are used in vecalls.c */ ++#endif ++} + #endif + + static void pty_close(struct tty_struct * tty, struct file * filp) + { + if (!tty) + return; ++ ++ ub_pty_uncharge(tty); + if (tty->driver->subtype == PTY_TYPE_MASTER) { + if (tty->count > 1) + printk("master pty_close: count = %d!!\n", tty->count); +@@ -61,8 +75,12 @@ static void pty_close(struct tty_struct + if (tty->driver->subtype == PTY_TYPE_MASTER) { + set_bit(TTY_OTHER_CLOSED, &tty->flags); + #ifdef CONFIG_UNIX98_PTYS +- if (tty->driver == ptm_driver) ++ if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) { ++ struct ve_struct *old_env; ++ old_env = set_exec_env(VE_OWNER_TTY(tty)); + devpts_pty_kill(tty->index); ++ (void)set_exec_env(old_env); ++ } + #endif + tty_vhangup(tty->link); + } +@@ -212,6 +230,10 @@ static int pty_open(struct tty_struct *t + if (tty->link->count != 1) + goto out; + ++ retval = -ENODEV; ++ if (ub_pty_charge(tty)) ++ goto out; ++ + clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); + set_bit(TTY_THROTTLED, &tty->flags); + set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); +@@ -239,7 +261,9 @@ static struct tty_operations pty_ops = { + + /* Traditional BSD devices */ + #ifdef CONFIG_LEGACY_PTYS +-static struct tty_driver *pty_driver, *pty_slave_driver; ++struct tty_driver *pty_driver, *pty_slave_driver; ++EXPORT_SYMBOL(pty_driver); ++EXPORT_SYMBOL(pty_slave_driver); + + static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg) +@@ -397,6 +421,7 @@ static void __init unix98_pty_init(void) + panic("Couldn't register Unix98 pts driver"); + + pty_table[1].data = &ptm_driver->refcount; ++ prepare_pty(); + } + #else + static inline void unix98_pty_init(void) { } +diff -uprN linux-2.6.15.orig/drivers/char/snsc_event.c linux-2.6.15-ve025stab014/drivers/char/snsc_event.c +--- linux-2.6.15.orig/drivers/char/snsc_event.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/char/snsc_event.c 2006-01-27 14:48:07.000000000 +0300 +@@ -206,7 +206,7 @@ scdrv_dispatch_event(char *event, int le + + /* first find init's task */ + read_lock(&tasklist_lock); +- for_each_process(p) { ++ for_each_process_all(p) { + if (p->pid == 1) + break; + } +diff -uprN linux-2.6.15.orig/drivers/char/sysrq.c linux-2.6.15-ve025stab014/drivers/char/sysrq.c +--- linux-2.6.15.orig/drivers/char/sysrq.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/char/sysrq.c 2006-01-27 14:48:07.000000000 +0300 +@@ -206,7 +206,7 @@ static void send_sig_all(int sig) + { + struct task_struct *p; + +- for_each_process(p) { ++ for_each_process_all(p) { + if (p->mm && p->pid != 1) + /* Not swapper, init nor kernel thread */ + force_sig(sig, p); +diff -uprN linux-2.6.15.orig/drivers/char/tty_io.c linux-2.6.15-ve025stab014/drivers/char/tty_io.c +--- linux-2.6.15.orig/drivers/char/tty_io.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/char/tty_io.c 2006-01-27 14:48:07.000000000 +0300 +@@ -86,6 +86,7 @@ + #include <linux/string.h> + #include <linux/slab.h> + #include <linux/poll.h> ++#include <linux/ve_owner.h> + #include <linux/proc_fs.h> + #include <linux/init.h> + #include <linux/module.h> +@@ -105,6 +106,7 @@ + #include <linux/devfs_fs_kernel.h> + + #include <linux/kmod.h> ++#include <ub/ub_mem.h> + + #undef TTY_DEBUG_HANGUP + +@@ -122,11 +124,16 @@ struct termios tty_std_termios = { /* fo + + EXPORT_SYMBOL(tty_std_termios); + ++/* this lock protects tty_drivers list, this pretty guys do no locking */ ++rwlock_t tty_driver_guard = RW_LOCK_UNLOCKED; 
++EXPORT_SYMBOL(tty_driver_guard); ++ + /* This list gets poked at by procfs and various bits of boot up code. This + could do with some rationalisation such as pulling the tty proc function + into this file */ + + LIST_HEAD(tty_drivers); /* linked list of tty drivers */ ++EXPORT_SYMBOL(tty_drivers); + + /* Semaphore to protect creating and releasing a tty. This is shared with + vt.c for deeply disgusting hack reasons */ +@@ -136,6 +143,15 @@ DECLARE_MUTEX(tty_sem); + extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ + extern int pty_limit; /* Config limit on Unix98 ptys */ + static DEFINE_IDR(allocated_ptys); ++#ifdef CONFIG_VE ++#define __ve_allocated_ptys(ve) (*((ve)->allocated_ptys)) ++#define ve_allocated_ptys __ve_allocated_ptys(get_exec_env()) ++#define ve_ptm_driver (get_exec_env()->ptm_driver) ++#else ++#define __ve_allocated_ptys(ve) allocated_ptys ++#define ve_allocated_ptys allocated_ptys ++#define ve_ptm_driver ptm_driver ++#endif + static DECLARE_MUTEX(allocated_ptys_lock); + static int ptmx_open(struct inode *, struct file *); + #endif +@@ -156,11 +172,25 @@ static int tty_fasync(int fd, struct fil + static void release_mem(struct tty_struct *tty, int idx); + + ++DCL_VE_OWNER(TTYDRV, struct tty_driver, owner_env) ++DCL_VE_OWNER(TTY, struct tty_struct, owner_env) ++ ++void prepare_tty(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->allocated_ptys = &allocated_ptys; ++ /* ++ * in this case, tty_register_driver() setups ++ * owner_env correctly right from the bootup ++ */ ++#endif ++} ++ + static struct tty_struct *alloc_tty_struct(void) + { + struct tty_struct *tty; + +- tty = kmalloc(sizeof(struct tty_struct), GFP_KERNEL); ++ tty = ub_kmalloc(sizeof(struct tty_struct), GFP_KERNEL); + if (tty) + memset(tty, 0, sizeof(struct tty_struct)); + return tty; +@@ -627,14 +657,37 @@ static struct tty_driver *get_tty_driver + { + struct tty_driver *p; + ++ read_lock(&tty_driver_guard); + list_for_each_entry(p, &tty_drivers, tty_drivers) { + dev_t base = MKDEV(p->major, p->minor_start); + if (device < base || device >= base + p->num) + continue; + *index = device - base; +- return p; ++#ifdef CONFIG_VE ++ if (in_interrupt()) ++ goto found; ++ if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR ++#ifdef CONFIG_UNIX98_PTYS ++ && (p->major<UNIX98_PTY_MASTER_MAJOR || ++ p->major>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) && ++ (p->major<UNIX98_PTY_SLAVE_MAJOR || ++ p->major>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) ++#endif ++ ) goto found; ++ if (ve_is_super(VE_OWNER_TTYDRV(p)) && ++ ve_is_super(get_exec_env())) ++ goto found; ++ if (!ve_accessible_strict(VE_OWNER_TTYDRV(p), get_exec_env())) ++ continue; ++#endif ++ goto found; + } ++ read_unlock(&tty_driver_guard); + return NULL; ++ ++found: ++ read_unlock(&tty_driver_guard); ++ return p; + } + + /* +@@ -862,7 +915,7 @@ static void do_tty_hangup(void *data) + + read_lock(&tasklist_lock); + if (tty->session > 0) { +- do_each_task_pid(tty->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { + if (p->signal->tty == tty) + p->signal->tty = NULL; + if (!p->signal->leader) +@@ -871,7 +924,7 @@ static void do_tty_hangup(void *data) + send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); + if (tty->pgrp > 0) + p->signal->tty_old_pgrp = tty->pgrp; +- } while_each_task_pid(tty->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); + } + read_unlock(&tasklist_lock); + +@@ -988,9 +1041,9 @@ void disassociate_ctty(int on_exit) + + /* Now clear 
signal->tty under the lock */ + read_lock(&tasklist_lock); +- do_each_task_pid(current->signal->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(current->signal->session, PIDTYPE_SID, p) { + p->signal->tty = NULL; +- } while_each_task_pid(current->signal->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(current->signal->session, PIDTYPE_SID, p); + read_unlock(&tasklist_lock); + up(&tty_sem); + unlock_kernel(); +@@ -1216,21 +1269,28 @@ static inline void tty_line_name(struct + * really quite straightforward. The semaphore locking can probably be + * relaxed for the (most common) case of reopening a tty. + */ +-static int init_dev(struct tty_driver *driver, int idx, +- struct tty_struct **ret_tty) ++static int init_dev(struct tty_driver *driver, int idx, ++ struct tty_struct *i_tty, struct tty_struct **ret_tty) + { + struct tty_struct *tty, *o_tty; + struct termios *tp, **tp_loc, *o_tp, **o_tp_loc; + struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; ++ struct ve_struct * owner; + int retval=0; + +- /* check whether we're reopening an existing tty */ +- if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { +- tty = devpts_get_tty(idx); +- if (tty && driver->subtype == PTY_TYPE_MASTER) +- tty = tty->link; +- } else { +- tty = driver->ttys[idx]; ++ owner = VE_OWNER_TTYDRV(driver); ++ ++ if (i_tty) ++ tty = i_tty; ++ else { ++ /* check whether we're reopening an existing tty */ ++ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { ++ tty = devpts_get_tty(idx); ++ if (tty && driver->subtype == PTY_TYPE_MASTER) ++ tty = tty->link; ++ } else { ++ tty = driver->ttys[idx]; ++ } + } + if (tty) goto fast_track; + +@@ -1258,6 +1318,7 @@ static int init_dev(struct tty_driver *d + tty->driver = driver; + tty->index = idx; + tty_line_name(driver, idx, tty->name); ++ SET_VE_OWNER_TTY(tty, owner); + + if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { + tp_loc = &tty->termios; +@@ -1268,7 +1329,7 @@ static int init_dev(struct tty_driver *d + } + + if (!*tp_loc) { +- tp = (struct termios *) kmalloc(sizeof(struct termios), ++ tp = (struct termios *) ub_kmalloc(sizeof(struct termios), + GFP_KERNEL); + if (!tp) + goto free_mem_out; +@@ -1276,7 +1337,7 @@ static int init_dev(struct tty_driver *d + } + + if (!*ltp_loc) { +- ltp = (struct termios *) kmalloc(sizeof(struct termios), ++ ltp = (struct termios *) ub_kmalloc(sizeof(struct termios), + GFP_KERNEL); + if (!ltp) + goto free_mem_out; +@@ -1291,6 +1352,7 @@ static int init_dev(struct tty_driver *d + o_tty->driver = driver->other; + o_tty->index = idx; + tty_line_name(driver->other, idx, o_tty->name); ++ SET_VE_OWNER_TTY(o_tty, owner); + + if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { + o_tp_loc = &o_tty->termios; +@@ -1302,7 +1364,7 @@ static int init_dev(struct tty_driver *d + + if (!*o_tp_loc) { + o_tp = (struct termios *) +- kmalloc(sizeof(struct termios), GFP_KERNEL); ++ ub_kmalloc(sizeof(struct termios), GFP_KERNEL); + if (!o_tp) + goto free_mem_out; + *o_tp = driver->other->init_termios; +@@ -1310,7 +1372,7 @@ static int init_dev(struct tty_driver *d + + if (!*o_ltp_loc) { + o_ltp = (struct termios *) +- kmalloc(sizeof(struct termios), GFP_KERNEL); ++ ub_kmalloc(sizeof(struct termios), GFP_KERNEL); + if (!o_ltp) + goto free_mem_out; + memset(o_ltp, 0, sizeof(struct termios)); +@@ -1328,6 +1390,10 @@ static int init_dev(struct tty_driver *d + *o_ltp_loc = o_ltp; + o_tty->termios = *o_tp_loc; + o_tty->termios_locked = *o_ltp_loc; ++#ifdef CONFIG_VE ++ if (driver->other->refcount == 0) ++ (void)get_ve(owner); ++#endif + driver->other->refcount++; + if (driver->subtype 
== PTY_TYPE_MASTER) + o_tty->count++; +@@ -1352,6 +1418,10 @@ static int init_dev(struct tty_driver *d + *ltp_loc = ltp; + tty->termios = *tp_loc; + tty->termios_locked = *ltp_loc; ++#ifdef CONFIG_VE ++ if (driver->refcount == 0) ++ (void)get_ve(owner); ++#endif + driver->refcount++; + tty->count++; + +@@ -1462,6 +1532,10 @@ static void release_mem(struct tty_struc + } + o_tty->magic = 0; + o_tty->driver->refcount--; ++#ifdef CONFIG_VE ++ if (o_tty->driver->refcount == 0) ++ put_ve(VE_OWNER_TTY(o_tty)); ++#endif + file_list_lock(); + list_del_init(&o_tty->tty_files); + file_list_unlock(); +@@ -1484,6 +1558,10 @@ static void release_mem(struct tty_struc + + tty->magic = 0; + tty->driver->refcount--; ++#ifdef CONFIG_VE ++ if (tty->driver->refcount == 0) ++ put_ve(VE_OWNER_TTY(tty)); ++#endif + file_list_lock(); + list_del_init(&tty->tty_files); + file_list_unlock(); +@@ -1507,7 +1585,10 @@ static void release_dev(struct file * fi + int idx; + char buf[64]; + unsigned long flags; +- ++#ifdef CONFIG_UNIX98_PTYS ++ struct idr *idr_alloced; ++#endif ++ + tty = (struct tty_struct *)filp->private_data; + if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "release_dev")) + return; +@@ -1522,6 +1603,9 @@ static void release_dev(struct file * fi + devpts = (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) != 0; + devpts_master = pty_master && devpts; + o_tty = tty->link; ++#ifdef CONFIG_UNIX98_PTYS ++ idr_alloced = &__ve_allocated_ptys(tty->owner_env); ++#endif + + #ifdef TTY_PARANOIA_CHECK + if (idx < 0 || idx >= tty->driver->num) { +@@ -1697,13 +1781,13 @@ static void release_dev(struct file * fi + struct task_struct *p; + + read_lock(&tasklist_lock); +- do_each_task_pid(tty->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { + p->signal->tty = NULL; +- } while_each_task_pid(tty->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); + if (o_tty) +- do_each_task_pid(o_tty->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(o_tty->session, PIDTYPE_SID, p) { + p->signal->tty = NULL; +- } while_each_task_pid(o_tty->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(o_tty->session, PIDTYPE_SID, p); + read_unlock(&tasklist_lock); + } + +@@ -1776,7 +1860,7 @@ static void release_dev(struct file * fi + /* Make this pty number available for reallocation */ + if (devpts) { + down(&allocated_ptys_lock); +- idr_remove(&allocated_ptys, idx); ++ idr_remove(idr_alloced, idx); + up(&allocated_ptys_lock); + } + #endif +@@ -1797,7 +1881,7 @@ static void release_dev(struct file * fi + */ + static int tty_open(struct inode * inode, struct file * filp) + { +- struct tty_struct *tty; ++ struct tty_struct *tty, *c_tty; + int noctty, retval; + struct tty_driver *driver; + int index; +@@ -1810,6 +1894,7 @@ retry_open: + noctty = filp->f_flags & O_NOCTTY; + index = -1; + retval = 0; ++ c_tty = NULL; + + down(&tty_sem); + +@@ -1820,6 +1905,7 @@ retry_open: + } + driver = current->signal->tty->driver; + index = current->signal->tty->index; ++ c_tty = current->signal->tty; + filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ + /* noctty = 1; */ + goto got_driver; +@@ -1827,6 +1913,12 @@ retry_open: + #ifdef CONFIG_VT + if (device == MKDEV(TTY_MAJOR,0)) { + extern struct tty_driver *console_driver; ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) { ++ up(&tty_sem); ++ return -ENODEV; ++ } ++#endif + driver = console_driver; + index = fg_console; + noctty = 1; +@@ -1834,6 +1926,12 @@ retry_open: + } + #endif + if (device == MKDEV(TTYAUX_MAJOR,1)) { 
++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) { ++ up(&tty_sem); ++ return -ENODEV; ++ } ++#endif + driver = console_device(&index); + if (driver) { + /* Don't let /dev/console block */ +@@ -1851,7 +1949,7 @@ retry_open: + return -ENODEV; + } + got_driver: +- retval = init_dev(driver, index, &tty); ++ retval = init_dev(driver, index, c_tty, &tty); + up(&tty_sem); + if (retval) + return retval; +@@ -1920,11 +2018,11 @@ static int ptmx_open(struct inode * inod + + /* find a device that is not in use. */ + down(&allocated_ptys_lock); +- if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { ++ if (!idr_pre_get(&ve_allocated_ptys, GFP_KERNEL)) { + up(&allocated_ptys_lock); + return -ENOMEM; + } +- idr_ret = idr_get_new(&allocated_ptys, NULL, &index); ++ idr_ret = idr_get_new(&ve_allocated_ptys, NULL, &index); + if (idr_ret < 0) { + up(&allocated_ptys_lock); + if (idr_ret == -EAGAIN) +@@ -1932,14 +2030,14 @@ static int ptmx_open(struct inode * inod + return -EIO; + } + if (index >= pty_limit) { +- idr_remove(&allocated_ptys, index); ++ idr_remove(&ve_allocated_ptys, index); + up(&allocated_ptys_lock); + return -EIO; + } + up(&allocated_ptys_lock); + + down(&tty_sem); +- retval = init_dev(ptm_driver, index, &tty); ++ retval = init_dev(ve_ptm_driver, index, NULL, &tty); + up(&tty_sem); + + if (retval) +@@ -1954,14 +2052,14 @@ static int ptmx_open(struct inode * inod + goto out1; + + check_tty_count(tty, "tty_open"); +- retval = ptm_driver->open(tty, filp); ++ retval = ve_ptm_driver->open(tty, filp); + if (!retval) + return 0; + out1: + release_dev(filp); + out: + down(&allocated_ptys_lock); +- idr_remove(&allocated_ptys, index); ++ idr_remove(&ve_allocated_ptys, index); + up(&allocated_ptys_lock); + return retval; + } +@@ -2074,6 +2172,8 @@ static int tioccons(struct file *file) + { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; ++ if (!ve_is_super(get_exec_env())) ++ return -EACCES; + if (file->f_op->write == redirected_tty_write) { + struct file *f; + spin_lock(&redirect_lock); +@@ -2134,9 +2234,9 @@ static int tiocsctty(struct tty_struct * + */ + + read_lock(&tasklist_lock); +- do_each_task_pid(tty->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { + p->signal->tty = NULL; +- } while_each_task_pid(tty->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); + read_unlock(&tasklist_lock); + } else + return -EPERM; +@@ -2158,7 +2258,7 @@ static int tiocgpgrp(struct tty_struct * + */ + if (tty == real_tty && current->signal->tty != real_tty) + return -ENOTTY; +- return put_user(real_tty->pgrp, p); ++ return put_user(pid_type_to_vpid(PIDTYPE_PGID, real_tty->pgrp), p); + } + + static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +@@ -2178,6 +2278,9 @@ static int tiocspgrp(struct tty_struct * + return -EFAULT; + if (pgrp < 0) + return -EINVAL; ++ pgrp = vpid_to_pid(pgrp); ++ if (pgrp < 0) ++ return -EPERM; + if (session_of_pgrp(pgrp) != current->signal->session) + return -EPERM; + real_tty->pgrp = pgrp; +@@ -2194,7 +2297,7 @@ static int tiocgsid(struct tty_struct *t + return -ENOTTY; + if (real_tty->session <= 0) + return -ENOTTY; +- return put_user(real_tty->session, p); ++ return put_user(pid_type_to_vpid(PIDTYPE_SID, real_tty->session), p); + } + + static int tiocsetd(struct tty_struct *tty, int __user *p) +@@ -2467,7 +2570,7 @@ static void __do_SAK(void *arg) + tty->driver->flush_buffer(tty); + + read_lock(&tasklist_lock); +- do_each_task_pid(session, PIDTYPE_SID, p) { ++ 
do_each_task_pid_all(session, PIDTYPE_SID, p) { + if (p->signal->tty == tty || session > 0) { + printk(KERN_NOTICE "SAK: killed process %d" + " (%s): p->signal->session==tty->session\n", +@@ -2495,7 +2598,7 @@ static void __do_SAK(void *arg) + rcu_read_unlock(); + } + task_unlock(p); +- } while_each_task_pid(session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(session, PIDTYPE_SID, p); + read_unlock(&tasklist_lock); + #endif + } +@@ -2857,8 +2960,11 @@ int tty_register_driver(struct tty_drive + + if (!driver->put_char) + driver->put_char = tty_default_put_char; +- ++ ++ SET_VE_OWNER_TTYDRV(driver, get_exec_env()); ++ write_lock_irq(&tty_driver_guard); + list_add(&driver->tty_drivers, &tty_drivers); ++ write_unlock_irq(&tty_driver_guard); + + if ( !(driver->flags & TTY_DRIVER_NO_DEVFS) ) { + for(i = 0; i < driver->num; i++) +@@ -2885,7 +2991,9 @@ int tty_unregister_driver(struct tty_dri + unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), + driver->num); + ++ write_lock_irq(&tty_driver_guard); + list_del(&driver->tty_drivers); ++ write_unlock_irq(&tty_driver_guard); + + /* + * Free the termios and termios_locked structures because +@@ -3008,6 +3116,7 @@ static int __init tty_init(void) + + vty_init(); + #endif ++ prepare_tty(); + return 0; + } + module_init(tty_init); +diff -uprN linux-2.6.15.orig/drivers/net/Makefile linux-2.6.15-ve025stab014/drivers/net/Makefile +--- linux-2.6.15.orig/drivers/net/Makefile 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/net/Makefile 2006-01-27 14:48:08.000000000 +0300 +@@ -15,6 +15,9 @@ obj-$(CONFIG_GIANFAR) += gianfar_driver. + + gianfar_driver-objs := gianfar.o gianfar_ethtool.o gianfar_mii.o + ++obj-$(CONFIG_VE_NETDEV) += vznetdev.o ++vznetdev-objs := open_vznet.o venet_core.o ++ + # + # link order important here + # +diff -uprN linux-2.6.15.orig/drivers/net/loopback.c linux-2.6.15-ve025stab014/drivers/net/loopback.c +--- linux-2.6.15.orig/drivers/net/loopback.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/net/loopback.c 2006-01-27 14:48:07.000000000 +0300 +@@ -198,6 +198,30 @@ static struct ethtool_ops loopback_ethto + .set_tso = ethtool_op_set_tso, + }; + ++static void loopback_destructor(struct net_device *dev) ++{ ++ kfree(dev->priv); ++ dev->priv = NULL; ++} ++ ++struct net_device templ_loopback_dev = { ++ .name = "lo", ++ .mtu = (16 * 1024) + 20 + 20 + 12, ++ .hard_start_xmit = loopback_xmit, ++ .hard_header = eth_header, ++ .hard_header_cache = eth_header_cache, ++ .header_cache_update = eth_header_cache_update, ++ .hard_header_len = ETH_HLEN, /* 14 */ ++ .addr_len = ETH_ALEN, /* 6 */ ++ .tx_queue_len = 0, ++ .type = ARPHRD_LOOPBACK, /* 0x0001*/ ++ .rebuild_header = eth_rebuild_header, ++ .flags = IFF_LOOPBACK, ++ .features = NETIF_F_SG|NETIF_F_FRAGLIST ++ |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA ++ |NETIF_F_LLTX, ++}; ++ + struct net_device loopback_dev = { + .name = "lo", + .mtu = (16 * 1024) + 20 + 20 + 12, +@@ -231,9 +255,11 @@ int __init loopback_init(void) + memset(stats, 0, sizeof(struct net_device_stats)); + loopback_dev.priv = stats; + loopback_dev.get_stats = &get_stats; ++ loopback_dev.destructor = &loopback_destructor; + } + + return register_netdev(&loopback_dev); + }; + + EXPORT_SYMBOL(loopback_dev); ++EXPORT_SYMBOL(templ_loopback_dev); +diff -uprN linux-2.6.15.orig/drivers/net/open_vznet.c linux-2.6.15-ve025stab014/drivers/net/open_vznet.c +--- linux-2.6.15.orig/drivers/net/open_vznet.c 1970-01-01 03:00:00.000000000 +0300 ++++ 
linux-2.6.15-ve025stab014/drivers/net/open_vznet.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,190 @@ ++/* ++ * open_vznet.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++/* ++ * Virtual Networking device used to change VE ownership on packets ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/seq_file.h> ++ ++#include <linux/inet.h> ++#include <net/ip.h> ++#include <linux/skbuff.h> ++#include <linux/venet.h> ++ ++void veip_stop(struct ve_struct *ve) ++{ ++ struct list_head *p, *tmp; ++ ++ write_lock_irq(&veip_hash_lock); ++ if (ve->veip == NULL) ++ goto unlock; ++ list_for_each_safe(p, tmp, &ve->veip->ip_lh) { ++ struct ip_entry_struct *ptr; ++ ptr = list_entry(p, struct ip_entry_struct, ve_list); ++ ptr->active_env = NULL; ++ list_del(&ptr->ve_list); ++ list_del(&ptr->ip_hash); ++ kfree(ptr); ++ } ++ veip_put(ve->veip); ++ ve->veip = NULL; ++unlock: ++ write_unlock_irq(&veip_hash_lock); ++} ++ ++int veip_start(struct ve_struct *ve) ++{ ++ int err; ++ ++ err = 0; ++ write_lock_irq(&veip_hash_lock); ++ ve->veip = veip_findcreate(ve->veid); ++ if (ve->veip == NULL) ++ err = -ENOMEM; ++ write_unlock_irq(&veip_hash_lock); ++ return err; ++} ++ ++int veip_entry_add(struct ve_struct *ve, struct sockaddr_in *addr) ++{ ++ struct ip_entry_struct *entry, *found; ++ int err; ++ ++ entry = kmalloc(sizeof(struct ip_entry_struct), GFP_KERNEL); ++ if (entry == NULL) ++ return -ENOMEM; ++ ++ memset(entry, 0, sizeof(struct ip_entry_struct)); ++ entry->ip = addr->sin_addr.s_addr; ++ ++ write_lock_irq(&veip_hash_lock); ++ err = -EADDRINUSE; ++ found = ip_entry_lookup(entry->ip); ++ if (found != NULL) ++ goto out_unlock; ++ else { ++ ip_entry_hash(entry, ve->veip); ++ found = entry; ++ entry = NULL; ++ } ++ err = 0; ++ found->active_env = ve; ++out_unlock: ++ write_unlock_irq(&veip_hash_lock); ++ if (entry != NULL) ++ kfree(entry); ++ return err; ++} ++ ++int veip_entry_del(envid_t veid, struct sockaddr_in *addr) ++{ ++ struct ip_entry_struct *found; ++ int err; ++ ++ err = -EADDRNOTAVAIL; ++ write_lock_irq(&veip_hash_lock); ++ found = ip_entry_lookup(addr->sin_addr.s_addr); ++ if (found == NULL) ++ goto out; ++ if (found->active_env->veid != veid) ++ goto out; ++ ++ err = 0; ++ found->active_env = NULL; ++ ++ list_del(&found->ip_hash); ++ list_del(&found->ve_list); ++ kfree(found); ++out: ++ write_unlock_irq(&veip_hash_lock); ++ return err; ++} ++ ++static struct ve_struct *venet_find_ve(__u32 ip) ++{ ++ struct ip_entry_struct *entry; ++ ++ entry = ip_entry_lookup(ip); ++ if (entry == NULL) ++ return NULL; ++ ++ return entry->active_env; ++} ++ ++int venet_change_skb_owner(struct sk_buff *skb) ++{ ++ struct ve_struct *ve, *ve_old; ++ struct iphdr *iph; ++ ++ ve_old = skb->owner_env; ++ iph = skb->nh.iph; ++ ++ read_lock(&veip_hash_lock); ++ if (!ve_is_super(ve_old)) { ++ /* from VE to host */ ++ ve = venet_find_ve(iph->saddr); ++ if (ve == NULL) ++ goto out_drop; ++ if (!ve_accessible_strict(ve, ve_old)) ++ goto out_source; ++ skb->owner_env = get_ve0(); ++ } else { ++ /* from host to VE */ ++ ve = venet_find_ve(iph->daddr); ++ if (ve == NULL) ++ goto out_drop; ++ skb->owner_env = ve; ++ } ++ read_unlock(&veip_hash_lock); ++ ++ return 0; ++ ++out_drop: ++ read_unlock(&veip_hash_lock); ++ return -ESRCH; ++ ++out_source: ++ read_unlock(&veip_hash_lock); ++ if (net_ratelimit()) { ++ printk(KERN_WARNING "Dropped packet, source wrong " ++ "veid=%u src-IP=%u.%u.%u.%u " ++ 
"dst-IP=%u.%u.%u.%u\n", ++ skb->owner_env->veid, ++ NIPQUAD(skb->nh.iph->saddr), ++ NIPQUAD(skb->nh.iph->daddr)); ++ } ++ return -EACCES; ++} ++ ++#ifdef CONFIG_PROC_FS ++int veip_seq_show(struct seq_file *m, void *v) ++{ ++ struct list_head *p; ++ struct ip_entry_struct *entry; ++ char s[16]; ++ ++ p = (struct list_head *)v; ++ if (p == ip_entry_hash_table) { ++ seq_puts(m, "Version: 2.5\n"); ++ return 0; ++ } ++ entry = list_entry(p, struct ip_entry_struct, ip_hash); ++ sprintf(s, "%u.%u.%u.%u", NIPQUAD(entry->ip)); ++ seq_printf(m, "%15s %10u\n", s, 0); ++ return 0; ++} ++#endif ++ ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Virtuozzo Virtual Network Device"); ++MODULE_LICENSE("GPL v2"); +diff -uprN linux-2.6.15.orig/drivers/net/tun.c linux-2.6.15-ve025stab014/drivers/net/tun.c +--- linux-2.6.15.orig/drivers/net/tun.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/net/tun.c 2006-01-27 14:48:08.000000000 +0300 +@@ -62,6 +62,7 @@ + + #include <asm/system.h> + #include <asm/uaccess.h> ++#include <ub/beancounter.h> + + #ifdef TUN_DEBUG + static int debug; +@@ -90,6 +91,7 @@ static int tun_net_close(struct net_devi + static int tun_net_xmit(struct sk_buff *skb, struct net_device *dev) + { + struct tun_struct *tun = netdev_priv(dev); ++ struct user_beancounter *ub; + + DBG(KERN_INFO "%s: tun_net_xmit %d\n", tun->dev->name, skb->len); + +@@ -114,6 +116,18 @@ static int tun_net_xmit(struct sk_buff * + } + } + ++ ub = netdev_bc(dev)->exec_ub; ++ if (ub && (skb_bc(skb)->charged == 0)) { ++ unsigned long charge; ++ charge = skb_charge_fullsize(skb); ++ if (charge_beancounter(ub, UB_OTHERSOCKBUF, charge, 1)) ++ goto drop; ++ get_beancounter(ub); ++ skb_bc(skb)->ub = ub; ++ skb_bc(skb)->charged = charge; ++ skb_bc(skb)->resource = UB_OTHERSOCKBUF; ++ } ++ + /* Queue packet */ + skb_queue_tail(&tun->readq, skb); + dev->trans_start = jiffies; +@@ -407,12 +421,15 @@ static ssize_t tun_chr_readv(struct file + tun->dev->name, addr[0], addr[1], addr[2], + addr[3], addr[4], addr[5]); + ret = tun_put_user(tun, skb, (struct iovec *) iv, len); ++ ++ /* skb will be uncharged in kfree_skb() */ + kfree_skb(skb); + break; + } else { + DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %x:%x:%x:%x:%x:%x\n", + tun->dev->name, addr[0], addr[1], addr[2], + addr[3], addr[4], addr[5]); ++ /* skb will be uncharged in kfree_skb() */ + kfree_skb(skb); + continue; + } +@@ -476,7 +493,8 @@ static int tun_set_iff(struct file *file + + /* Check permissions */ + if (tun->owner != -1 && +- current->euid != tun->owner && !capable(CAP_NET_ADMIN)) ++ current->euid != tun->owner && ++ !capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + } + else if (__dev_get_by_name(ifr->ifr_name)) +diff -uprN linux-2.6.15.orig/drivers/net/venet_core.c linux-2.6.15-ve025stab014/drivers/net/venet_core.c +--- linux-2.6.15.orig/drivers/net/venet_core.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/net/venet_core.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,625 @@ ++/* ++ * venet_core.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++/* ++ * Common part for Virtuozzo virtual network devices ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/interrupt.h> ++#include <linux/fs.h> ++#include <linux/types.h> ++#include <linux/string.h> ++#include <linux/socket.h> ++#include <linux/errno.h> ++#include <linux/fcntl.h> ++#include <linux/in.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/tcp.h> ++#include <linux/proc_fs.h> ++#include <linux/seq_file.h> ++ ++#include <asm/system.h> ++#include <asm/uaccess.h> ++#include <asm/io.h> ++#include <asm/unistd.h> ++ ++#include <linux/inet.h> ++#include <linux/netdevice.h> ++#include <linux/etherdevice.h> ++#include <net/ip.h> ++#include <linux/skbuff.h> ++#include <net/sock.h> ++#include <linux/if_ether.h> /* For the statistics structure. */ ++#include <linux/if_arp.h> /* For ARPHRD_ETHER */ ++#include <linux/venet.h> ++#include <linux/ve_proto.h> ++#include <linux/vzctl.h> ++#include <linux/vzctl_venet.h> ++ ++struct list_head ip_entry_hash_table[VEIP_HASH_SZ]; ++rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED; ++LIST_HEAD(veip_lh); ++ ++#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1)) ++ ++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip) ++{ ++ list_add(&entry->ip_hash, ++ ip_entry_hash_table + ip_entry_hash_function(entry->ip)); ++ list_add(&entry->ve_list, &veip->ip_lh); ++} ++ ++void veip_put(struct veip_struct *veip) ++{ ++ if (!list_empty(&veip->ip_lh)) ++ return; ++ if (!list_empty(&veip->src_lh)) ++ return; ++ if (!list_empty(&veip->dst_lh)) ++ return; ++ ++ list_del(&veip->list); ++ kfree(veip); ++} ++ ++struct ip_entry_struct *ip_entry_lookup(u32 addr) ++{ ++ struct ip_entry_struct *entry; ++ struct list_head *tmp; ++ ++ list_for_each(tmp, ip_entry_hash_table + ip_entry_hash_function(addr)) { ++ entry = list_entry(tmp, struct ip_entry_struct, ip_hash); ++ if (entry->ip != addr) ++ continue; ++ return entry; ++ } ++ return NULL; ++} ++ ++struct veip_struct *veip_find(envid_t veid) ++{ ++ struct veip_struct *ptr; ++ list_for_each_entry(ptr, &veip_lh, list) { ++ if (ptr->veid != veid) ++ continue; ++ return ptr; ++ } ++ return NULL; ++} ++ ++struct veip_struct *veip_findcreate(envid_t veid) ++{ ++ struct veip_struct *ptr; ++ ++ ptr = veip_find(veid); ++ if (ptr != NULL) ++ return ptr; ++ ++ ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC); ++ if (ptr == NULL) ++ return NULL; ++ memset(ptr, 0, sizeof(struct veip_struct)); ++ INIT_LIST_HEAD(&ptr->ip_lh); ++ INIT_LIST_HEAD(&ptr->src_lh); ++ INIT_LIST_HEAD(&ptr->dst_lh); ++ list_add(&ptr->list, &veip_lh); ++ ptr->veid = veid; ++ return ptr; ++} ++ ++/* ++ * Device functions ++ */ ++ ++static int venet_open(struct net_device *dev) ++{ ++ if (!try_module_get(THIS_MODULE)) ++ return -EBUSY; ++ return 0; ++} ++ ++static int venet_close(struct net_device *master) ++{ ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++static void venet_destructor(struct net_device *dev) ++{ ++ kfree(dev->priv); ++ dev->priv = NULL; ++} ++ ++/* ++ * The higher levels take care of making this non-reentrant (it's ++ * called with bh's disabled). 
++ */ ++static int venet_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct net_device_stats *stats = (struct net_device_stats *)dev->priv; ++ struct net_device *rcv = NULL; ++ struct iphdr *iph; ++ int length; ++ ++ /* ++ * Optimise so buffers with skb->free=1 are not copied but ++ * instead are lobbed from tx queue to rx queue ++ */ ++ if (atomic_read(&skb->users) != 1) { ++ struct sk_buff *skb2 = skb; ++ skb = skb_clone(skb, GFP_ATOMIC); /* Clone the buffer */ ++ if (skb == NULL) { ++ kfree_skb(skb2); ++ goto out; ++ } ++ kfree_skb(skb2); ++ } else ++ skb_orphan(skb); ++ ++ if (skb->protocol != __constant_htons(ETH_P_IP)) ++ goto outf; ++ ++ iph = skb->nh.iph; ++ if (MULTICAST(iph->daddr)) ++ goto outf; ++ ++ if (venet_change_skb_owner(skb) < 0) ++ goto outf; ++ ++ rcv = VE_OWNER_SKB(skb)->_venet_dev; ++ if (!rcv) ++ /* VE going down */ ++ goto outf; ++ ++ dev_hold(rcv); ++ ++ if (!(rcv->flags & IFF_UP)) { ++ /* Target VE does not want to receive packets */ ++ dev_put(rcv); ++ goto outf; ++ } ++ ++ skb->pkt_type = PACKET_HOST; ++ skb->dev = rcv; ++ ++ skb->mac.raw = skb->data; ++ memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len); ++ ++ dst_release(skb->dst); ++ skb->dst = NULL; ++#ifdef CONFIG_NETFILTER ++ nf_conntrack_put(skb->nfct); ++ skb->nfct = NULL; ++#ifdef CONFIG_NETFILTER_DEBUG ++ skb->nf_debug = 0; ++#endif ++#endif ++ length = skb->len; ++ ++ netif_rx(skb); ++ ++ stats->tx_bytes += length; ++ stats->tx_packets++; ++ if (rcv) { ++ struct net_device_stats *rcv_stats = ++ (struct net_device_stats *)rcv->priv; ++ rcv_stats->rx_bytes += length; ++ rcv_stats->rx_packets++; ++ dev_put(rcv); ++ } ++ ++ return 0; ++ ++outf: ++ kfree_skb(skb); ++ ++stats->tx_dropped; ++out: ++ return 0; ++} ++ ++static struct net_device_stats *get_stats(struct net_device *dev) ++{ ++ return (struct net_device_stats *)dev->priv; ++} ++ ++/* Initialize the rest of the LOOPBACK device. */ ++int venet_init_dev(struct net_device *dev) ++{ ++ dev->hard_start_xmit = venet_xmit; ++ dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); ++ if (dev->priv == NULL) ++ return -ENOMEM; ++ memset(dev->priv, 0, sizeof(struct net_device_stats)); ++ dev->get_stats = get_stats; ++ dev->open = venet_open; ++ dev->stop = venet_close; ++ dev->destructor = venet_destructor; ++ ++ /* ++ * Fill in the generic fields of the device structure. ++ */ ++ dev->type = ARPHRD_VOID; ++ dev->hard_header_len = ETH_HLEN; ++ dev->mtu = 1500; /* eth_mtu */ ++ dev->tx_queue_len = 0; ++ ++ memset(dev->broadcast, 0xFF, ETH_ALEN); ++ ++ /* New-style flags. 
*/ ++ dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT; ++ return 0; ++} ++ ++static void venet_setup(struct net_device *dev) ++{ ++ dev->init = venet_init_dev; ++ /* ++ * No other features, as they are: ++ * - checksumming is required, and nobody else will done our job ++ */ ++ dev->features |= NETIF_F_VENET; ++} ++ ++#ifdef CONFIG_PROC_FS ++static int veinfo_seq_show(struct seq_file *m, void *v) ++{ ++ struct ve_struct *ve = (struct ve_struct *)v; ++ struct list_head *tmp; ++ ++ seq_printf(m, "%10u %5u %5u", ve->veid, ++ ve->class_id, atomic_read(&ve->pcounter)); ++ read_lock(&veip_hash_lock); ++ if (ve->veip == NULL) ++ goto unlock; ++ list_for_each(tmp, &ve->veip->ip_lh) { ++ char ip[16]; ++ struct ip_entry_struct *entry; ++ ++ entry = list_entry(tmp, struct ip_entry_struct, ve_list); ++ if (entry->active_env == NULL) ++ continue; ++ ++ sprintf(ip, "%u.%u.%u.%u", NIPQUAD(entry->ip)); ++ seq_printf(m, " %15s", ip); ++ } ++unlock: ++ read_unlock(&veip_hash_lock); ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static void *ve_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct ve_struct *ve, *curve; ++ loff_t l; ++ ++ curve = get_exec_env(); ++ read_lock(&ve_list_guard); ++ if (!ve_is_super(curve)) { ++ if (*pos != 0) ++ return NULL; ++ return curve; ++ } ++ for (ve = ve_list_head, l = *pos; ++ ve != NULL && l > 0; ++ ve = ve->next, l--); ++ return ve; ++} ++ ++static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct ve_struct *ve = (struct ve_struct *)v; ++ ++ if (!ve_is_super(get_exec_env())) ++ return NULL; ++ (*pos)++; ++ return ve->next; ++} ++ ++static void ve_seq_stop(struct seq_file *m, void *v) ++{ ++ read_unlock(&ve_list_guard); ++} ++ ++ ++static struct seq_operations veinfo_seq_op = { ++ start: ve_seq_start, ++ next: ve_seq_next, ++ stop: ve_seq_stop, ++ show: veinfo_seq_show ++}; ++ ++static int veinfo_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &veinfo_seq_op); ++} ++ ++static struct file_operations proc_veinfo_operations = { ++ open: veinfo_open, ++ read: seq_read, ++ llseek: seq_lseek, ++ release: seq_release ++}; ++ ++static void *veip_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ loff_t l; ++ struct list_head *p; ++ int i; ++ ++ l = *pos; ++ write_lock_irq(&veip_hash_lock); ++ if (l == 0) ++ return ip_entry_hash_table; ++ for (i = 0; i < VEIP_HASH_SZ; i++) { ++ list_for_each(p, ip_entry_hash_table + i) { ++ if (--l == 0) ++ return p; ++ } ++ } ++ return NULL; ++} ++ ++static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct list_head *p; ++ ++ p = (struct list_head *)v; ++ while (1) { ++ p = p->next; ++ if (p < ip_entry_hash_table || ++ p >= ip_entry_hash_table + VEIP_HASH_SZ) { ++ (*pos)++; ++ return p; ++ } ++ if (++p >= ip_entry_hash_table + VEIP_HASH_SZ) ++ return NULL; ++ } ++ return NULL; ++} ++ ++static void veip_seq_stop(struct seq_file *m, void *v) ++{ ++ write_unlock_irq(&veip_hash_lock); ++} ++ ++static struct seq_operations veip_seq_op = { ++ start: veip_seq_start, ++ next: veip_seq_next, ++ stop: veip_seq_stop, ++ show: veip_seq_show ++}; ++ ++static int veip_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &veip_seq_op); ++} ++ ++static struct file_operations proc_veip_operations = { ++ open: veip_open, ++ read: seq_read, ++ llseek: seq_lseek, ++ release: seq_release ++}; ++#endif ++ ++int real_ve_ip_map(envid_t veid, int op, struct sockaddr *uservaddr, int addrlen) ++{ ++ int err; ++ struct sockaddr_in addr; ++ struct ve_struct *ve; ++ ++ err 
= -EPERM; ++ if (!capable(CAP_SETVEID)) ++ goto out; ++ ++ err = -EINVAL; ++ if (addrlen != sizeof(struct sockaddr_in)) ++ goto out; ++ ++ err = move_addr_to_kernel(uservaddr, addrlen, &addr); ++ if (err < 0) ++ goto out; ++ ++ switch (op) ++ { ++ case VE_IP_ADD: ++ ve = get_ve_by_id(veid); ++ err = -ESRCH; ++ if (!ve) ++ goto out; ++ ++ down_read(&ve->op_sem); ++ if (ve->is_running) ++ err = veip_entry_add(ve, &addr); ++ up_read(&ve->op_sem); ++ put_ve(ve); ++ break; ++ ++ case VE_IP_DEL: ++ err = veip_entry_del(veid, &addr); ++ break; ++ default: ++ err = -EINVAL; ++ } ++ ++out: ++ return err; ++} ++ ++int venet_ioctl(struct inode *ino, struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err; ++ ++ err = -ENOTTY; ++ switch(cmd) { ++ case VENETCTL_VE_IP_MAP: { ++ struct vzctl_ve_ip_map s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen); ++ } ++ break; ++ } ++ return err; ++} ++ ++static struct vzioctlinfo venetcalls = { ++ type: VENETCTLTYPE, ++ func: venet_ioctl, ++ owner: THIS_MODULE, ++}; ++ ++int venet_dev_start(struct ve_struct *env) ++{ ++ struct net_device *dev_venet; ++ int err; ++ ++ dev_venet = alloc_netdev(0, "venet%d", venet_setup); ++ if (!dev_venet) ++ return -ENOMEM; ++ err = dev_alloc_name(dev_venet, dev_venet->name); ++ if (err<0) ++ goto err; ++ if ((err = register_netdev(dev_venet)) != 0) ++ goto err; ++ env->_venet_dev = dev_venet; ++ return 0; ++err: ++ free_netdev(dev_venet); ++ printk(KERN_ERR "VENET initialization error err=%d\n", err); ++ return err; ++} ++ ++static int venet_start(unsigned int hooknum, void *data) ++{ ++ struct ve_struct *env; ++ int err; ++ ++ env = ((struct ve_hook_init_data *)data)->env; ++ if (env->veip) ++ return -EEXIST; ++ if (!ve_is_super(env) && !try_module_get(THIS_MODULE)) ++ return 0; ++ ++ err = veip_start(env); ++ if (err) ++ goto err; ++ ++ err = venet_dev_start(env); ++ if (err) ++ goto err_free; ++ return 0; ++ ++err_free: ++ veip_stop(env); ++err: ++ if (!ve_is_super(env)) ++ module_put(THIS_MODULE); ++ return err; ++} ++ ++static int venet_stop(unsigned int hooknum, void *data) ++{ ++ struct ve_struct *env; ++ ++ if (hooknum == VE_HOOK_INIT) ++ env = ((struct ve_hook_init_data *)data)->env; ++ else ++ env = (struct ve_struct *)data; ++ veip_stop(env); ++ if (!ve_is_super(env)) ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++#define VE_HOOK_PRI_NET 0 ++ ++static struct ve_hook venet_ve_hook_init = { ++ hook: venet_start, ++ undo: venet_stop, ++ hooknum: VE_HOOK_INIT, ++ priority: VE_HOOK_PRI_NET ++}; ++ ++static struct ve_hook venet_ve_hook_fini = { ++ hook: venet_stop, ++ hooknum: VE_HOOK_FINI, ++ priority: VE_HOOK_PRI_NET ++}; ++ ++__init int venet_init(void) ++{ ++ struct ve_hook_init_data vhd; ++#ifdef CONFIG_PROC_FS ++ struct proc_dir_entry *de; ++#endif ++ int i, err; ++ ++ if (get_ve0()->_venet_dev != NULL) ++ return -EEXIST; ++ ++ for (i = 0; i < VEIP_HASH_SZ; i++) ++ INIT_LIST_HEAD(ip_entry_hash_table + i); ++ ++ vhd.env = get_ve0(); ++ err = venet_start(VE_HOOK_INIT, (void *)&vhd); ++ if (err) ++ return err; ++ ++#ifdef CONFIG_PROC_FS ++ de = create_proc_glob_entry("vz/veinfo", ++ S_IFREG|S_IRUSR, NULL); ++ if (de) ++ de->proc_fops = &proc_veinfo_operations; ++ else ++ printk(KERN_WARNING "venet: can't make veinfo proc entry\n"); ++ ++ de = create_proc_entry("vz/veip", S_IFREG|S_IRUSR, NULL); ++ if (de) ++ de->proc_fops = &proc_veip_operations; ++ else ++ printk(KERN_WARNING "venet: can't make veip proc 
entry\n"); ++#endif ++ ++ ve_hook_register(&venet_ve_hook_init); ++ ve_hook_register(&venet_ve_hook_fini); ++ vzioctl_register(&venetcalls); ++ return 0; ++} ++ ++__exit void venet_exit(void) ++{ ++ struct net_device *dev_venet; ++ ++ vzioctl_unregister(&venetcalls); ++ ve_hook_unregister(&venet_ve_hook_fini); ++ ve_hook_unregister(&venet_ve_hook_init); ++#ifdef CONFIG_PROC_FS ++ remove_proc_entry("vz/veip", NULL); ++ remove_proc_entry("vz/veinfo", NULL); ++#endif ++ ++ dev_venet = get_ve0()->_venet_dev; ++ if (dev_venet != NULL) { ++ get_ve0()->_venet_dev = NULL; ++ unregister_netdev(dev_venet); ++ free_netdev(dev_venet); ++ } ++ veip_stop(get_ve0()); ++} ++ ++module_init(venet_init); ++module_exit(venet_exit); +diff -uprN linux-2.6.15.orig/drivers/pci/probe.c linux-2.6.15-ve025stab014/drivers/pci/probe.c +--- linux-2.6.15.orig/drivers/pci/probe.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/pci/probe.c 2006-01-27 14:48:08.000000000 +0300 +@@ -21,6 +21,7 @@ LIST_HEAD(pci_root_buses); + EXPORT_SYMBOL(pci_root_buses); + + LIST_HEAD(pci_devices); ++EXPORT_SYMBOL(pci_devices); + + #ifdef HAVE_PCI_LEGACY + /** +diff -uprN linux-2.6.15.orig/drivers/s390/cio/cio.c linux-2.6.15-ve025stab014/drivers/s390/cio/cio.c +--- linux-2.6.15.orig/drivers/s390/cio/cio.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/drivers/s390/cio/cio.c 2006-01-27 14:48:08.000000000 +0300 +@@ -604,7 +604,11 @@ do_IRQ (struct pt_regs *regs) + struct tpi_info *tpi_info; + struct subchannel *sch; + struct irb *irb; ++ struct ve_struct *ve; ++ struct user_beancounter *ub; + ++ ve = set_exec_env(get_ve0()); ++ ub = set_exec_ub(get_ub0()); + irq_enter (); + asm volatile ("mc 0,0"); + if (S390_lowcore.int_clock >= S390_lowcore.jiffy_timer) +@@ -651,6 +655,8 @@ do_IRQ (struct pt_regs *regs) + */ + } while (!MACHINE_IS_VM && tpi (NULL) != 0); + irq_exit (); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); + } + + #ifdef CONFIG_CCW_CONSOLE +diff -uprN linux-2.6.15.orig/fs/Kconfig linux-2.6.15-ve025stab014/fs/Kconfig +--- linux-2.6.15.orig/fs/Kconfig 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/Kconfig 2006-01-27 14:48:08.000000000 +0300 +@@ -403,6 +403,38 @@ config QFMT_V2 + This quota format allows using quotas with 32-bit UIDs/GIDs. If you + need this functionality say Y here. + ++config SIM_FS ++ tristate "VPS filesystem" ++ depends on VZ_QUOTA ++ default m ++ help ++ This file system is a part of Virtuozzo. It intoduces a fake ++ superblock and blockdev to VE to hide real device and show ++ statfs results taken from quota. ++ ++config VZ_QUOTA ++ tristate "Virtuozzo Disk Quota support" ++ depends on QUOTA ++ default m ++ help ++ Virtuozzo Disk Quota imposes disk quota on directories with their ++ files and subdirectories in total. Such disk quota is used to ++ account and limit disk usage by Virtuozzo VPS, but also may be used ++ separately. ++ ++config VZ_QUOTA_UNLOAD ++ bool "Unloadable Virtuozzo Disk Quota module" ++ depends on VZ_QUOTA=m ++ default n ++ help ++ Make Virtuozzo Disk Quota module unloadable. ++ Doesn't work reliably now. 
++ ++config VZ_QUOTA_UGID ++ bool "Per-user and per-group quota in Virtuozzo quota partitions" ++ depends on VZ_QUOTA!=n ++ default y ++ + config QUOTACTL + bool + depends on XFS_QUOTA || QUOTA +diff -uprN linux-2.6.15.orig/fs/Makefile linux-2.6.15-ve025stab014/fs/Makefile +--- linux-2.6.15.orig/fs/Makefile 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/Makefile 2006-01-27 14:48:08.000000000 +0300 +@@ -39,9 +39,15 @@ obj-$(CONFIG_QUOTA) += dquot.o + obj-$(CONFIG_QFMT_V1) += quota_v1.o + obj-$(CONFIG_QFMT_V2) += quota_v2.o + obj-$(CONFIG_QUOTACTL) += quota.o ++obj-$(CONFIG_VZ_QUOTA) += vzdquota.o ++vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o ++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o ++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o + + obj-$(CONFIG_DNOTIFY) += dnotify.o + ++obj-$(CONFIG_SIM_FS) += simfs.o ++ + obj-$(CONFIG_PROC_FS) += proc/ + obj-y += partitions/ + obj-$(CONFIG_SYSFS) += sysfs/ +diff -uprN linux-2.6.15.orig/fs/binfmt_elf.c linux-2.6.15-ve025stab014/fs/binfmt_elf.c +--- linux-2.6.15.orig/fs/binfmt_elf.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/binfmt_elf.c 2006-01-27 14:48:08.000000000 +0300 +@@ -355,7 +355,7 @@ static unsigned long load_elf_interp(str + eppnt = elf_phdata; + for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) { + if (eppnt->p_type == PT_LOAD) { +- int elf_type = MAP_PRIVATE | MAP_DENYWRITE; ++ int elf_type = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECPRIO; + int elf_prot = 0; + unsigned long vaddr = 0; + unsigned long k, map_addr; +@@ -828,7 +828,7 @@ static int load_elf_binary(struct linux_ + if (elf_ppnt->p_flags & PF_W) elf_prot |= PROT_WRITE; + if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; + +- elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE; ++ elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE|MAP_EXECPRIO; + + vaddr = elf_ppnt->p_vaddr; + if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { +@@ -1270,10 +1270,10 @@ static void fill_prstatus(struct elf_prs + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; +- prstatus->pr_pid = p->pid; +- prstatus->pr_ppid = p->parent->pid; +- prstatus->pr_pgrp = process_group(p); +- prstatus->pr_sid = p->signal->session; ++ prstatus->pr_pid = virt_pid(p); ++ prstatus->pr_ppid = virt_pid(p->parent); ++ prstatus->pr_pgrp = virt_pgid(p); ++ prstatus->pr_sid = virt_sid(p); + if (thread_group_leader(p)) { + /* + * This is the record for the group leader. Add in the +@@ -1316,10 +1316,10 @@ static int fill_psinfo(struct elf_prpsin + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + +- psinfo->pr_pid = p->pid; +- psinfo->pr_ppid = p->parent->pid; +- psinfo->pr_pgrp = process_group(p); +- psinfo->pr_sid = p->signal->session; ++ psinfo->pr_pid = virt_pid(p); ++ psinfo->pr_ppid = virt_pid(p->parent); ++ psinfo->pr_pgrp = virt_pgid(p); ++ psinfo->pr_sid = virt_sid(p); + + i = p->state ? 
ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; +@@ -1452,7 +1452,7 @@ static int elf_core_dump(long signr, str + if (signr) { + struct elf_thread_status *tmp; + read_lock(&tasklist_lock); +- do_each_thread(g,p) ++ do_each_thread_ve(g,p) + if (current->mm == p->mm && current != p) { + tmp = kmalloc(sizeof(*tmp), GFP_ATOMIC); + if (!tmp) { +@@ -1464,7 +1464,7 @@ static int elf_core_dump(long signr, str + tmp->thread = p; + list_add(&tmp->list, &thread_list); + } +- while_each_thread(g,p); ++ while_each_thread_ve(g,p); + read_unlock(&tasklist_lock); + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp; +diff -uprN linux-2.6.15.orig/fs/block_dev.c linux-2.6.15-ve025stab014/fs/block_dev.c +--- linux-2.6.15.orig/fs/block_dev.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/block_dev.c 2006-01-27 14:48:08.000000000 +0300 +@@ -561,9 +561,16 @@ static int do_open(struct block_device * + { + struct module *owner = NULL; + struct gendisk *disk; +- int ret = -ENXIO; ++ int ret; + int part; + ++#ifdef CONFIG_VE ++ ret = get_device_perms_ve(S_IFBLK, bdev->bd_dev, ++ file->f_mode&(FMODE_READ|FMODE_WRITE)); ++ if (ret) ++ return ret; ++#endif ++ ret = -ENXIO; + file->f_mapping = bdev->bd_inode->i_mapping; + lock_kernel(); + disk = get_gendisk(bdev->bd_dev, &part); +@@ -832,7 +839,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); + * namespace if possible and return it. Return ERR_PTR(error) + * otherwise. + */ +-struct block_device *lookup_bdev(const char *path) ++struct block_device *lookup_bdev(const char *path, int mode) + { + struct block_device *bdev; + struct inode *inode; +@@ -850,6 +857,11 @@ struct block_device *lookup_bdev(const c + error = -ENOTBLK; + if (!S_ISBLK(inode->i_mode)) + goto fail; ++#ifdef CONFIG_VE ++ error = get_device_perms_ve(S_IFBLK, inode->i_rdev, mode); ++ if (error) ++ goto fail; ++#endif + error = -EACCES; + if (nd.mnt->mnt_flags & MNT_NODEV) + goto fail; +@@ -881,12 +893,13 @@ struct block_device *open_bdev_excl(cons + mode_t mode = FMODE_READ; + int error = 0; + +- bdev = lookup_bdev(path); ++ if (!(flags & MS_RDONLY)) ++ mode |= FMODE_WRITE; ++ ++ bdev = lookup_bdev(path, mode); + if (IS_ERR(bdev)) + return bdev; + +- if (!(flags & MS_RDONLY)) +- mode |= FMODE_WRITE; + error = blkdev_get(bdev, mode, 0); + if (error) + return ERR_PTR(error); +diff -uprN linux-2.6.15.orig/fs/char_dev.c linux-2.6.15-ve025stab014/fs/char_dev.c +--- linux-2.6.15.orig/fs/char_dev.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/char_dev.c 2006-01-27 14:48:08.000000000 +0300 +@@ -292,6 +292,13 @@ int chrdev_open(struct inode * inode, st + struct cdev *new = NULL; + int ret = 0; + ++#ifdef CONFIG_VE ++ ret = get_device_perms_ve(S_IFCHR, inode->i_rdev, ++ filp->f_mode&(FMODE_READ|FMODE_WRITE)); ++ if (ret) ++ return ret; ++#endif ++ + spin_lock(&cdev_lock); + p = inode->i_cdev; + if (!p) { +diff -uprN linux-2.6.15.orig/fs/dcache.c linux-2.6.15-ve025stab014/fs/dcache.c +--- linux-2.6.15.orig/fs/dcache.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/dcache.c 2006-01-27 14:48:08.000000000 +0300 +@@ -28,11 +28,15 @@ + #include <linux/module.h> + #include <linux/mount.h> + #include <linux/file.h> ++#include <linux/namei.h> + #include <asm/uaccess.h> + #include <linux/security.h> + #include <linux/seqlock.h> + #include <linux/swap.h> + #include <linux/bootmem.h> ++#include <linux/kernel_stat.h> ++ ++#include <ub/ub_dcache.h> + + /* #define DCACHE_DEBUG 1 */ + +@@ -44,7 +48,7 @@ static seqlock_t rename_lock __cacheline + + 
EXPORT_SYMBOL(dcache_lock); + +-static kmem_cache_t *dentry_cache; ++kmem_cache_t *dentry_cache; + + #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) + +@@ -114,6 +118,75 @@ static inline void dentry_iput(struct de + } + } + ++struct dcache_shrinker { ++ struct list_head list; ++ struct dentry *dentry; ++}; ++ ++DECLARE_WAIT_QUEUE_HEAD(dcache_shrinker_wq); ++ ++/* called under dcache_lock */ ++static void dcache_shrinker_add(struct dcache_shrinker *ds, ++ struct dentry *parent, struct dentry *dentry) ++{ ++ struct super_block *sb; ++ ++ sb = parent->d_sb; ++ ds->dentry = parent; ++ list_add(&ds->list, &sb->s_dshrinkers); ++} ++ ++/* called under dcache_lock */ ++static void dcache_shrinker_del(struct dcache_shrinker *ds) ++{ ++ if (ds == NULL || list_empty(&ds->list)) ++ return; ++ ++ list_del_init(&ds->list); ++ wake_up_all(&dcache_shrinker_wq); ++} ++ ++/* called under dcache_lock, drops inside */ ++static void dcache_shrinker_wait(struct super_block *sb) ++{ ++ DECLARE_WAITQUEUE(wq, current); ++ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ add_wait_queue(&dcache_shrinker_wq, &wq); ++ spin_unlock(&dcache_lock); ++ ++ schedule(); ++ remove_wait_queue(&dcache_shrinker_wq, &wq); ++ __set_current_state(TASK_RUNNING); ++} ++ ++void dcache_shrinker_wait_sb(struct super_block *sb) ++{ ++ /* the root dentry can be held in dput_recursive */ ++ spin_lock(&dcache_lock); ++ while (!list_empty(&sb->s_dshrinkers)) { ++ dcache_shrinker_wait(sb); ++ spin_lock(&dcache_lock); ++ } ++ spin_unlock(&dcache_lock); ++} ++ ++/* dcache_lock protects shrinker's list */ ++static void shrink_dcache_racecheck(struct dentry *parent, int *racecheck) ++{ ++ struct super_block *sb; ++ struct dcache_shrinker *ds; ++ ++ sb = parent->d_sb; ++ list_for_each_entry(ds, &sb->s_dshrinkers, list) { ++ /* is one of dcache shrinkers working on the dentry? */ ++ if (ds->dentry == parent) { ++ *racecheck = 1; ++ break; ++ } ++ } ++} ++ + /* + * This is dput + * +@@ -132,8 +205,9 @@ static inline void dentry_iput(struct de + */ + + /* +- * dput - release a dentry +- * @dentry: dentry to release ++ * dput_recursive - go upward through the dentry tree and release dentries ++ * @dentry: starting dentry ++ * @ds: shrinker to be added to active list (see shrink_dcache_parent) + * + * Release a dentry. This will drop the usage count and if appropriate + * call the dentry unlink method as well as removing it from the queues and +@@ -142,18 +216,15 @@ static inline void dentry_iput(struct de + * + * no dcache lock, please. 
+ */ +- +-void dput(struct dentry *dentry) ++static void dput_recursive(struct dentry *dentry, struct dcache_shrinker *ds) + { +- if (!dentry) +- return; +- +-repeat: + if (atomic_read(&dentry->d_count) == 1) + might_sleep(); + if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) + return; ++ dcache_shrinker_del(ds); + ++repeat: + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); +@@ -185,6 +256,7 @@ unhash_it: + + kill_it: { + struct dentry *parent; ++ struct dcache_shrinker lds; + + /* If dentry was on d_lru list + * delete it from there +@@ -194,18 +266,50 @@ kill_it: { + dentry_stat.nr_unused--; + } + list_del(&dentry->d_child); ++ parent = dentry->d_parent; ++ dcache_shrinker_add(&lds, parent, dentry); + dentry_stat.nr_dentry--; /* For d_free, below */ + /*drops the locks, at that point nobody can reach this dentry */ + dentry_iput(dentry); +- parent = dentry->d_parent; + d_free(dentry); +- if (dentry == parent) ++ if (unlikely(dentry == parent)) { ++ spin_lock(&dcache_lock); ++ dcache_shrinker_del(&lds); ++ spin_unlock(&dcache_lock); + return; ++ } + dentry = parent; +- goto repeat; ++ spin_lock(&dcache_lock); ++ dcache_shrinker_del(&lds); ++ if (atomic_dec_and_test(&dentry->d_count)) ++ goto repeat; ++ spin_unlock(&dcache_lock); + } + } + ++/* ++ * dput - release a dentry ++ * @dentry: dentry to release ++ * ++ * Release a dentry. This will drop the usage count and if appropriate ++ * call the dentry unlink method as well as removing it from the queues and ++ * releasing its resources. If the parent dentries were scheduled for release ++ * they too may now get deleted. ++ * ++ * no dcache lock, please. ++ */ ++ ++void dput(struct dentry *dentry) ++{ ++ if (!dentry) ++ return; ++ ++ spin_lock(&dcache_lock); ++ ub_dentry_uncharge(dentry); ++ spin_unlock(&dcache_lock); ++ dput_recursive(dentry, NULL); ++} ++ + /** + * d_invalidate - invalidate a dentry + * @dentry: dentry to invalidate +@@ -272,6 +376,8 @@ static inline struct dentry * __dget_loc + dentry_stat.nr_unused--; + list_del_init(&dentry->d_lru); + } ++ ++ ub_dentry_charge_nofail(dentry); + return dentry; + } + +@@ -362,19 +468,27 @@ restart: + * removed. + * Called with dcache_lock, drops it and then regains. + */ +-static inline void prune_one_dentry(struct dentry * dentry) ++static void prune_one_dentry(struct dentry * dentry) + { + struct dentry * parent; ++ struct dcache_shrinker ds; + + __d_drop(dentry); + list_del(&dentry->d_child); ++ parent = dentry->d_parent; ++ dcache_shrinker_add(&ds, parent, dentry); + dentry_stat.nr_dentry--; /* For d_free, below */ + dentry_iput(dentry); + parent = dentry->d_parent; + d_free(dentry); + if (parent != dentry) +- dput(parent); ++ /* ++ * dentry is not in use, only child (not outside) ++ * references change, so parent->d_inuse does not change ++ */ ++ dput_recursive(parent, &ds); + spin_lock(&dcache_lock); ++ dcache_shrinker_del(&ds); + } + + /** +@@ -557,13 +671,12 @@ positive: + * drop the lock and return early due to latency + * constraints. 
+ */ +-static int select_parent(struct dentry * parent) ++static int select_parent(struct dentry * parent, int * racecheck) + { + struct dentry *this_parent = parent; + struct list_head *next; + int found = 0; + +- spin_lock(&dcache_lock); + repeat: + next = this_parent->d_subdirs.next; + resume: +@@ -605,6 +718,9 @@ dentry->d_parent->d_name.name, dentry->d + #endif + goto repeat; + } ++ ++ if (!found && racecheck != NULL) ++ shrink_dcache_racecheck(dentry, racecheck); + } + /* + * All done at this level ... ascend and resume the search. +@@ -619,7 +735,6 @@ this_parent->d_parent->d_name.name, this + goto resume; + } + out: +- spin_unlock(&dcache_lock); + return found; + } + +@@ -632,10 +747,66 @@ out: + + void shrink_dcache_parent(struct dentry * parent) + { +- int found; ++ int found, r; ++ ++ while (1) { ++ spin_lock(&dcache_lock); ++ found = select_parent(parent, NULL); ++ if (found) ++ goto found; + +- while ((found = select_parent(parent)) != 0) ++ /* ++ * try again with a dput_recursive() race check. ++ * it returns quickly if everything was really shrinked ++ */ ++ r = 0; ++ found = select_parent(parent, &r); ++ if (found) ++ goto found; ++ if (!r) ++ break; ++ ++ /* drops the lock inside */ ++ dcache_shrinker_wait(parent->d_sb); ++ continue; ++ ++found: ++ spin_unlock(&dcache_lock); + prune_dcache(found); ++ } ++ spin_unlock(&dcache_lock); ++} ++ ++/* ++ * Move any unused anon dentries to the end of the unused list. ++ * called under dcache_lock ++ */ ++static int select_anon(struct hlist_head *head, int *racecheck) ++{ ++ struct hlist_node *lp; ++ int found = 0; ++ ++ hlist_for_each(lp, head) { ++ struct dentry *this = hlist_entry(lp, struct dentry, d_hash); ++ if (!list_empty(&this->d_lru)) { ++ dentry_stat.nr_unused--; ++ list_del_init(&this->d_lru); ++ } ++ ++ /* ++ * move only zero ref count dentries to the end ++ * of the unused list for prune_dcache ++ */ ++ if (!atomic_read(&this->d_count)) { ++ list_add_tail(&this->d_lru, &dentry_unused); ++ dentry_stat.nr_unused++; ++ found++; ++ } ++ ++ if (!found && racecheck != NULL) ++ shrink_dcache_racecheck(this, racecheck); ++ } ++ return found; + } + + /** +@@ -648,33 +819,36 @@ void shrink_dcache_parent(struct dentry + * done under dcache_lock. + * + */ +-void shrink_dcache_anon(struct hlist_head *head) ++void shrink_dcache_anon(struct super_block *sb) + { +- struct hlist_node *lp; +- int found; +- do { +- found = 0; ++ int found, r; ++ ++ while (1) { + spin_lock(&dcache_lock); +- hlist_for_each(lp, head) { +- struct dentry *this = hlist_entry(lp, struct dentry, d_hash); +- if (!list_empty(&this->d_lru)) { +- dentry_stat.nr_unused--; +- list_del_init(&this->d_lru); +- } ++ found = select_anon(&sb->s_anon, NULL); ++ if (found) ++ goto found; + +- /* +- * move only zero ref count dentries to the end +- * of the unused list for prune_dcache +- */ +- if (!atomic_read(&this->d_count)) { +- list_add_tail(&this->d_lru, &dentry_unused); +- dentry_stat.nr_unused++; +- found++; +- } +- } ++ /* ++ * try again with a dput_recursive() race check. 
++ * it returns quickly if everything was really shrinked ++ */ ++ r = 0; ++ found = select_anon(&sb->s_anon, &r); ++ if (found) ++ goto found; ++ if (!r) ++ break; ++ ++ /* drops the lock inside */ ++ dcache_shrinker_wait(sb); ++ continue; ++ ++found: + spin_unlock(&dcache_lock); + prune_dcache(found); +- } while(found); ++ } ++ spin_unlock(&dcache_lock); + } + + /* +@@ -691,12 +865,18 @@ void shrink_dcache_anon(struct hlist_hea + */ + static int shrink_dcache_memory(int nr, gfp_t gfp_mask) + { ++ int res = -1; ++ ++ KSTAT_PERF_ENTER(shrink_dcache) + if (nr) { + if (!(gfp_mask & __GFP_FS)) +- return -1; ++ goto out; + prune_dcache(nr); + } +- return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; ++ res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; ++out: ++ KSTAT_PERF_LEAVE(shrink_dcache) ++ return res; + } + + /** +@@ -716,19 +896,20 @@ struct dentry *d_alloc(struct dentry * p + + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + if (!dentry) +- return NULL; ++ goto err_alloc; + + if (name->len > DNAME_INLINE_LEN-1) { + dname = kmalloc(name->len + 1, GFP_KERNEL); +- if (!dname) { +- kmem_cache_free(dentry_cache, dentry); +- return NULL; +- } ++ if (!dname) ++ goto err_name; + } else { + dname = dentry->d_iname; + } + dentry->d_name.name = dname; + ++ if (ub_dentry_alloc(dentry)) ++ goto err_charge; ++ + dentry->d_name.len = name->len; + dentry->d_name.hash = name->hash; + memcpy(dname, name->name, name->len); +@@ -757,12 +938,23 @@ struct dentry *d_alloc(struct dentry * p + } + + spin_lock(&dcache_lock); +- if (parent) ++ if (parent) { + list_add(&dentry->d_child, &parent->d_subdirs); ++ if (parent->d_flags & DCACHE_VIRTUAL) ++ dentry->d_flags |= DCACHE_VIRTUAL; ++ } + dentry_stat.nr_dentry++; + spin_unlock(&dcache_lock); + + return dentry; ++ ++err_charge: ++ if (name->len > DNAME_INLINE_LEN - 1) ++ kfree(dname); ++err_name: ++ kmem_cache_free(dentry_cache, dentry); ++err_alloc: ++ return NULL; + } + + struct dentry *d_alloc_name(struct dentry *parent, const char *name) +@@ -1041,7 +1233,6 @@ struct dentry * __d_lookup(struct dentry + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_head *head = d_hash(parent,hash); +- struct dentry *found = NULL; + struct hlist_node *node; + struct dentry *dentry; + +@@ -1082,7 +1273,7 @@ struct dentry * __d_lookup(struct dentry + + if (!d_unhashed(dentry)) { + atomic_inc(&dentry->d_count); +- found = dentry; ++ goto found; + } + spin_unlock(&dentry->d_lock); + break; +@@ -1091,7 +1282,18 @@ next: + } + rcu_read_unlock(); + +- return found; ++ return NULL; ++ ++found: ++ /* ++ * d_lock and rcu_read_lock ++ * are dropped in ub_dentry_charge() ++ */ ++ if (ub_dentry_charge(dentry)) { ++ dput(dentry); ++ dentry = NULL; ++ } ++ return dentry; + } + + /** +@@ -1338,6 +1540,32 @@ already_unhashed: + } + + /** ++ * __d_path_add_deleted - prepend "(deleted) " text ++ * @end: a pointer to the character after free space at the beginning of the ++ * buffer ++ * @buflen: remaining free space ++ */ ++static inline char * __d_path_add_deleted(char * end, int buflen) ++{ ++ buflen -= 10; ++ if (buflen < 0) ++ return ERR_PTR(-ENAMETOOLONG); ++ end -= 10; ++ memcpy(end, "(deleted) ", 10); ++ return end; ++} ++ ++/** ++ * d_root_check - checks if dentry is accessible from current's fs root ++ * @dentry: dentry to be verified ++ * @vfsmnt: vfsmnt to which the dentry belongs ++ */ ++int d_root_check(struct dentry *dentry, struct vfsmount *vfsmnt) ++{ ++ return PTR_ERR(d_path(dentry, vfsmnt, NULL, 0)); 
++} ++ ++/** + * d_path - return the path of a dentry + * @dentry: dentry to report + * @vfsmnt: vfsmnt to which the dentry belongs +@@ -1358,36 +1586,35 @@ static char * __d_path( struct dentry *d + char *buffer, int buflen) + { + char * end = buffer+buflen; +- char * retval; ++ char * retval = NULL; + int namelen; ++ int deleted; ++ struct vfsmount *oldvfsmnt; + +- *--end = '\0'; +- buflen--; +- if (!IS_ROOT(dentry) && d_unhashed(dentry)) { +- buflen -= 10; +- end -= 10; +- if (buflen < 0) ++ oldvfsmnt = vfsmnt; ++ deleted = (!IS_ROOT(dentry) && d_unhashed(dentry)); ++ if (buffer != NULL) { ++ *--end = '\0'; ++ buflen--; ++ ++ if (buflen < 1) + goto Elong; +- memcpy(end, " (deleted)", 10); ++ /* Get '/' right */ ++ retval = end-1; ++ *retval = '/'; + } + +- if (buflen < 1) +- goto Elong; +- /* Get '/' right */ +- retval = end-1; +- *retval = '/'; +- + for (;;) { + struct dentry * parent; + + if (dentry == root && vfsmnt == rootmnt) + break; + if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { +- /* Global root? */ ++ /* root of a tree? */ + spin_lock(&vfsmount_lock); + if (vfsmnt->mnt_parent == vfsmnt) { + spin_unlock(&vfsmount_lock); +- goto global_root; ++ goto other_root; + } + dentry = vfsmnt->mnt_mountpoint; + vfsmnt = vfsmnt->mnt_parent; +@@ -1396,27 +1623,51 @@ static char * __d_path( struct dentry *d + } + parent = dentry->d_parent; + prefetch(parent); ++ if (buffer != NULL) { ++ namelen = dentry->d_name.len; ++ buflen -= namelen + 1; ++ if (buflen < 0) ++ goto Elong; ++ end -= namelen; ++ memcpy(end, dentry->d_name.name, namelen); ++ *--end = '/'; ++ retval = end; ++ } ++ dentry = parent; ++ } ++ /* the given root point is reached */ ++finish: ++ if (buffer != NULL && deleted) ++ retval = __d_path_add_deleted(end, buflen); ++ return retval; ++ ++other_root: ++ /* ++ * We traversed the tree upward and reached a root, but the given ++ * lookup terminal point wasn't encountered. It means either that the ++ * dentry is out of our scope or belongs to an abstract space like ++ * sock_mnt or pipe_mnt. Check for it. ++ * ++ * There are different options to check it. ++ * We may assume that any dentry tree is unreachable unless it's ++ * connected to `root' (defined as fs root of init aka child reaper) ++ * and expose all paths that are not connected to it. ++ * The other option is to allow exposing of known abstract spaces ++ * explicitly and hide the path information for other cases. ++ * This approach is more safe, let's take it. 
2001/04/22 SAW ++ */ ++ if (!(oldvfsmnt->mnt_sb->s_flags & MS_NOUSER)) ++ return ERR_PTR(-EINVAL); ++ if (buffer != NULL) { + namelen = dentry->d_name.len; +- buflen -= namelen + 1; ++ buflen -= namelen; + if (buflen < 0) + goto Elong; +- end -= namelen; +- memcpy(end, dentry->d_name.name, namelen); +- *--end = '/'; +- retval = end; +- dentry = parent; ++ retval -= namelen-1; /* hit the slash */ ++ memcpy(retval, dentry->d_name.name, namelen); + } ++ goto finish; + +- return retval; +- +-global_root: +- namelen = dentry->d_name.len; +- buflen -= namelen; +- if (buflen < 0) +- goto Elong; +- retval -= namelen-1; /* hit the slash */ +- memcpy(retval, dentry->d_name.name, namelen); +- return retval; + Elong: + return ERR_PTR(-ENAMETOOLONG); + } +@@ -1441,6 +1692,228 @@ char * d_path(struct dentry *dentry, str + return res; + } + ++#ifdef CONFIG_VE ++#include <net/sock.h> ++#include <linux/ip.h> ++#include <linux/file.h> ++#include <linux/namespace.h> ++#include <linux/vzratelimit.h> ++ ++static void mark_sub_tree_virtual(struct dentry *d) ++{ ++ struct dentry *orig_root; ++ ++ orig_root = d; ++ while (1) { ++ spin_lock(&d->d_lock); ++ d->d_flags |= DCACHE_VIRTUAL; ++ spin_unlock(&d->d_lock); ++ ++ if (!list_empty(&d->d_subdirs)) { ++ d = list_entry(d->d_subdirs.next, ++ struct dentry, d_child); ++ continue; ++ } ++ if (d == orig_root) ++ break; ++ while (d == list_entry(d->d_parent->d_subdirs.prev, ++ struct dentry, d_child)) { ++ d = d->d_parent; ++ if (d == orig_root) ++ goto out; ++ } ++ d = list_entry(d->d_child.next, ++ struct dentry, d_child); ++ } ++out: ++ return; ++} ++ ++void mark_tree_virtual(struct vfsmount *m, struct dentry *d) ++{ ++ struct vfsmount *orig_rootmnt; ++ ++ spin_lock(&dcache_lock); ++ spin_lock(&vfsmount_lock); ++ orig_rootmnt = m; ++ while (1) { ++ mark_sub_tree_virtual(d); ++ if (!list_empty(&m->mnt_mounts)) { ++ m = list_entry(m->mnt_mounts.next, ++ struct vfsmount, mnt_child); ++ d = m->mnt_root; ++ continue; ++ } ++ if (m == orig_rootmnt) ++ break; ++ while (m == list_entry(m->mnt_parent->mnt_mounts.prev, ++ struct vfsmount, mnt_child)) { ++ m = m->mnt_parent; ++ if (m == orig_rootmnt) ++ goto out; ++ } ++ m = list_entry(m->mnt_child.next, ++ struct vfsmount, mnt_child); ++ d = m->mnt_root; ++ } ++out: ++ spin_unlock(&vfsmount_lock); ++ spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(mark_tree_virtual); ++ ++static struct vz_rate_info area_ri = { 20, 10*HZ }; ++#define VE_AREA_ACC_CHECK 0x0001 ++#define VE_AREA_ACC_DENY 0x0002 ++#define VE_AREA_EXEC_CHECK 0x0010 ++#define VE_AREA_EXEC_DENY 0x0020 ++#define VE0_AREA_ACC_CHECK 0x0100 ++#define VE0_AREA_ACC_DENY 0x0200 ++#define VE0_AREA_EXEC_CHECK 0x1000 ++#define VE0_AREA_EXEC_DENY 0x2000 ++int ve_area_access_check = 0; ++ ++static void print_connection_info(struct task_struct *tsk) ++{ ++ struct files_struct *files; ++ struct fdtable *fdt; ++ int fd; ++ ++ files = get_files_struct(tsk); ++ if (!files) ++ return; ++ ++ spin_lock(&files->file_lock); ++ fdt = files_fdtable(files); ++ for (fd = 0; fd < fdt->max_fds; fd++) { ++ struct file *file; ++ struct inode *inode; ++ struct socket *socket; ++ struct sock *sk; ++ struct inet_sock *inet; ++ ++ file = fdt->fd[fd]; ++ if (file == NULL) ++ continue; ++ ++ inode = file->f_dentry->d_inode; ++ if (!S_ISSOCK(inode->i_mode)) ++ continue; ++ ++ socket = SOCKET_I(inode); ++ if (socket == NULL) ++ continue; ++ ++ sk = socket->sk; ++ if (sk->sk_family != PF_INET || sk->sk_type != SOCK_STREAM) ++ continue; ++ ++ inet = inet_sk(sk); ++ printk(KERN_ALERT "connection from 
%u.%u.%u.%u:%u to port %u\n", ++ NIPQUAD(inet->daddr), ntohs(inet->dport), ++ inet->num); ++ } ++ spin_unlock(&files->file_lock); ++ put_files_struct(files); ++} ++ ++static void check_alert(struct vfsmount *vfsmnt, struct dentry *dentry, ++ char *str) ++{ ++ struct task_struct *tsk; ++ unsigned long page; ++ struct super_block *sb; ++ char *p; ++ ++ if (!vz_ratelimit(&area_ri)) ++ return; ++ ++ tsk = current; ++ p = ERR_PTR(-ENOMEM); ++ page = __get_free_page(GFP_KERNEL); ++ if (page) { ++ spin_lock(&dcache_lock); ++ p = __d_path(dentry, vfsmnt, tsk->fs->root, tsk->fs->rootmnt, ++ (char *)page, PAGE_SIZE); ++ spin_unlock(&dcache_lock); ++ } ++ if (IS_ERR(p)) ++ p = "(undefined)"; ++ ++ sb = dentry->d_sb; ++ printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n" ++ "Task %d/%d[%s] from VE%d, execenv %d\n", ++ str, p, VE_OWNER_FSTYPE(sb->s_type)->veid, ++ sb->s_type->name, sb->s_dev, ++ tsk->pid, virt_pid(tsk), tsk->comm, ++ VE_TASK_INFO(tsk)->owner_env->veid, ++ get_exec_env()->veid); ++ ++ free_page(page); ++ ++ print_connection_info(tsk); ++ ++ read_lock(&tasklist_lock); ++ tsk = tsk->real_parent; ++ get_task_struct(tsk); ++ read_unlock(&tasklist_lock); ++ ++ printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n", ++ tsk->pid, virt_pid(tsk), tsk->comm, ++ VE_TASK_INFO(tsk)->owner_env->veid); ++ ++ print_connection_info(tsk); ++ put_task_struct(tsk); ++ dump_stack(); ++} ++#endif ++ ++int check_area_access_ve(struct dentry *dentry, struct vfsmount *mnt) ++{ ++#ifdef CONFIG_VE ++ int check, alert, deny; ++ ++ if (ve_is_super(get_exec_env())) { ++ check = ve_area_access_check & VE0_AREA_ACC_CHECK; ++ alert = dentry->d_flags & DCACHE_VIRTUAL; ++ deny = ve_area_access_check & VE0_AREA_ACC_DENY; ++ } else { ++ check = ve_area_access_check & VE_AREA_ACC_CHECK; ++ alert = !(dentry->d_flags & DCACHE_VIRTUAL); ++ deny = ve_area_access_check & VE_AREA_ACC_DENY; ++ } ++ ++ if (check && alert) ++ check_alert(mnt, dentry, "Access"); ++ if (deny && alert) ++ return -EACCES; ++#endif ++ return 0; ++} ++ ++int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt) ++{ ++#ifdef CONFIG_VE ++ int check, alert, deny; ++ ++ if (ve_is_super(get_exec_env())) { ++ check = ve_area_access_check & VE0_AREA_EXEC_CHECK; ++ alert = dentry->d_flags & DCACHE_VIRTUAL; ++ deny = ve_area_access_check & VE0_AREA_EXEC_DENY; ++ } else { ++ check = ve_area_access_check & VE_AREA_EXEC_CHECK; ++ alert = !(dentry->d_flags & DCACHE_VIRTUAL); ++ deny = ve_area_access_check & VE_AREA_EXEC_DENY; ++ } ++ ++ if (check && alert) ++ check_alert(mnt, dentry, "Exec"); ++ if (deny && alert) ++ return -EACCES; ++#endif ++ return 0; ++} ++ + /* + * NOTE! The user-level library version returns a + * character pointer. 
The kernel system call just +@@ -1577,10 +2050,12 @@ resume: + goto repeat; + } + atomic_dec(&dentry->d_count); ++ ub_dentry_uncharge(dentry); + } + if (this_parent != root) { + next = this_parent->d_child.next; + atomic_dec(&this_parent->d_count); ++ ub_dentry_uncharge(this_parent); + this_parent = this_parent->d_parent; + goto resume; + } +@@ -1729,7 +2204,8 @@ void __init vfs_caches_init(unsigned lon + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); ++ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, ++ filp_ctor, filp_dtor); + + dcache_init(mempages); + inode_init(mempages); +diff -uprN linux-2.6.15.orig/fs/devpts/inode.c linux-2.6.15-ve025stab014/fs/devpts/inode.c +--- linux-2.6.15.orig/fs/devpts/inode.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/devpts/inode.c 2006-01-27 14:48:08.000000000 +0300 +@@ -12,6 +12,7 @@ + + #include <linux/module.h> + #include <linux/init.h> ++#include <linux/ve.h> + #include <linux/fs.h> + #include <linux/sched.h> + #include <linux/namei.h> +@@ -21,16 +22,17 @@ + + #define DEVPTS_SUPER_MAGIC 0x1cd1 + ++struct devpts_config devpts_config = {.mode = 0600}; ++ ++#ifndef CONFIG_VE + static struct vfsmount *devpts_mnt; + static struct dentry *devpts_root; +- +-static struct { +- int setuid; +- int setgid; +- uid_t uid; +- gid_t gid; +- umode_t mode; +-} config = {.mode = 0600}; ++#define config devpts_config ++#else ++#define devpts_mnt (get_exec_env()->devpts_mnt) ++#define devpts_root (get_exec_env()->devpts_root) ++#define config (*(get_exec_env()->devpts_config)) ++#endif + + static int devpts_remount(struct super_block *sb, int *flags, char *data) + { +@@ -56,7 +58,8 @@ static int devpts_remount(struct super_b + } else if (sscanf(this_char, "mode=%o%c", &n, &dummy) == 1) + mode = n & ~S_IFMT; + else { +- printk("devpts: called with bogus options\n"); ++ ve_printk(VE_LOG, ++ "devpts: called with bogus options\n"); + return -EINVAL; + } + } +@@ -121,6 +124,8 @@ static struct file_system_type devpts_fs + .kill_sb = kill_anon_super, + }; + ++EXPORT_SYMBOL(devpts_fs_type); ++ + /* + * The normal naming convention is simply /dev/pts/<number>; this conforms + * to the System V naming convention +@@ -212,6 +217,7 @@ static int __init init_devpts_fs(void) + + static void __exit exit_devpts_fs(void) + { ++ /* the code is never called, the argument is irrelevant */ + unregister_filesystem(&devpts_fs_type); + mntput(devpts_mnt); + } +diff -uprN linux-2.6.15.orig/fs/exec.c linux-2.6.15-ve025stab014/fs/exec.c +--- linux-2.6.15.orig/fs/exec.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/exec.c 2006-01-27 14:48:08.000000000 +0300 +@@ -53,6 +53,8 @@ + #include <asm/uaccess.h> + #include <asm/mmu_context.h> + ++#include <ub/ub_vmpages.h> ++ + #ifdef CONFIG_KMOD + #include <linux/kmod.h> + #endif +@@ -308,6 +310,10 @@ void install_arg_page(struct vm_area_str + struct mm_struct *mm = vma->vm_mm; + pte_t * pte; + spinlock_t *ptl; ++ struct page_beancounter *pb; ++ ++ if (unlikely(pb_alloc(&pb))) ++ goto out_nopb; + + if (unlikely(anon_vma_prepare(vma))) + goto out; +@@ -321,15 +327,21 @@ void install_arg_page(struct vm_area_str + goto out; + } + inc_mm_counter(mm, anon_rss); ++ inc_vma_rss(vma); + lru_cache_add_active(page); + set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( + page, vma->vm_page_prot)))); ++ pb_add_ref(page, mm, &pb); ++ ub_unused_privvm_dec(mm, vma); ++ pb_free(&pb); + 
page_add_anon_rmap(page, vma, address); + pte_unmap_unlock(pte, ptl); + + /* no need for flush_tlb */ + return; + out: ++ pb_free(&pb); ++out_nopb: + __free_page(page); + force_sig(SIGKILL, current); + } +@@ -404,9 +416,13 @@ int setup_arg_pages(struct linux_binprm + bprm->loader += stack_base; + bprm->exec += stack_base; + +- mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); ++ if (ub_memory_charge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags, ++ NULL, UB_SOFT)) ++ goto fail_charge; ++ ++ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | __GFP_SOFT_UBC); + if (!mpnt) +- return -ENOMEM; ++ goto fail_alloc; + + memset(mpnt, 0, sizeof(*mpnt)); + +@@ -450,6 +466,11 @@ int setup_arg_pages(struct linux_binprm + up_write(&mm->mmap_sem); + + return 0; ++ ++fail_alloc: ++ ub_memory_uncharge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags, NULL); ++fail_charge: ++ return -ENOMEM; + } + + EXPORT_SYMBOL(setup_arg_pages); +@@ -657,7 +678,7 @@ static inline int de_thread(struct task_ + */ + if (!thread_group_leader(current)) { + struct task_struct *parent; +- struct dentry *proc_dentry1, *proc_dentry2; ++ struct dentry *proc_dentry1[2], *proc_dentry2[2]; + unsigned long ptrace; + + /* +@@ -671,8 +692,8 @@ static inline int de_thread(struct task_ + + spin_lock(&leader->proc_lock); + spin_lock(¤t->proc_lock); +- proc_dentry1 = proc_pid_unhash(current); +- proc_dentry2 = proc_pid_unhash(leader); ++ proc_pid_unhash(current, proc_dentry1); ++ proc_pid_unhash(leader, proc_dentry2); + write_lock_irq(&tasklist_lock); + + BUG_ON(leader->tgid != current->tgid); +@@ -891,6 +912,7 @@ int flush_old_exec(struct linux_binprm * + suid_keys(current); + current->mm->dumpable = suid_dumpable; + } ++ current->mm->vps_dumpable = 1; + + /* An exec changes our domain. We are no longer part of the thread + group */ +@@ -1282,7 +1304,7 @@ static void format_corename(char *corena + case 'p': + pid_in_pattern = 1; + rc = snprintf(out_ptr, out_end - out_ptr, +- "%d", current->tgid); ++ "%d", virt_tgid(current)); + if (rc > out_end - out_ptr) + goto out; + out_ptr += rc; +@@ -1326,7 +1348,7 @@ static void format_corename(char *corena + case 'h': + down_read(&uts_sem); + rc = snprintf(out_ptr, out_end - out_ptr, +- "%s", system_utsname.nodename); ++ "%s", ve_utsname.nodename); + up_read(&uts_sem); + if (rc > out_end - out_ptr) + goto out; +@@ -1354,7 +1376,7 @@ static void format_corename(char *corena + if (!pid_in_pattern + && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { + rc = snprintf(out_ptr, out_end - out_ptr, +- ".%d", current->tgid); ++ ".%d", virt_tgid(current)); + if (rc > out_end - out_ptr) + goto out; + out_ptr += rc; +@@ -1380,7 +1402,7 @@ static void zap_threads (struct mm_struc + } + + read_lock(&tasklist_lock); +- do_each_thread(g,p) ++ do_each_thread_ve(g,p) + if (mm == p->mm && p != tsk) { + force_sig_specific(SIGKILL, p); + mm->core_waiters++; +@@ -1388,7 +1410,7 @@ static void zap_threads (struct mm_struc + unlikely(p->parent->mm == mm)) + traced = 1; + } +- while_each_thread(g,p); ++ while_each_thread_ve(g,p); + + read_unlock(&tasklist_lock); + +@@ -1400,12 +1422,12 @@ static void zap_threads (struct mm_struc + * coredump to finish. Detach them so they can both die. 
+ */ + write_lock_irq(&tasklist_lock); +- do_each_thread(g,p) { ++ do_each_thread_ve(g,p) { + if (mm == p->mm && p != tsk && + p->ptrace && p->parent->mm == mm) { + __ptrace_unlink(p); + } +- } while_each_thread(g,p); ++ } while_each_thread_ve(g,p); + write_unlock_irq(&tasklist_lock); + } + } +@@ -1441,7 +1463,8 @@ int do_coredump(long signr, int exit_cod + if (!binfmt || !binfmt->core_dump) + goto fail; + down_write(&mm->mmap_sem); +- if (!mm->dumpable) { ++ if (!mm->dumpable || ++ (!mm->vps_dumpable && !ve_is_super(get_exec_env()))) { + up_write(&mm->mmap_sem); + goto fail; + } +diff -uprN linux-2.6.15.orig/fs/ext2/namei.c linux-2.6.15-ve025stab014/fs/ext2/namei.c +--- linux-2.6.15.orig/fs/ext2/namei.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/ext2/namei.c 2006-01-27 14:48:08.000000000 +0300 +@@ -31,6 +31,7 @@ + */ + + #include <linux/pagemap.h> ++#include <linux/quotaops.h> + #include "ext2.h" + #include "xattr.h" + #include "acl.h" +@@ -276,6 +277,8 @@ static int ext2_unlink(struct inode * di + struct page * page; + int err = -ENOENT; + ++ DQUOT_INIT(inode); ++ + de = ext2_find_entry (dir, dentry, &page); + if (!de) + goto out; +@@ -318,6 +321,9 @@ static int ext2_rename (struct inode * o + struct ext2_dir_entry_2 * old_de; + int err = -ENOENT; + ++ if (new_inode) ++ DQUOT_INIT(new_inode); ++ + old_de = ext2_find_entry (old_dir, old_dentry, &old_page); + if (!old_de) + goto out; +diff -uprN linux-2.6.15.orig/fs/ext2/super.c linux-2.6.15-ve025stab014/fs/ext2/super.c +--- linux-2.6.15.orig/fs/ext2/super.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/ext2/super.c 2006-01-27 14:48:08.000000000 +0300 +@@ -1200,7 +1200,7 @@ static struct file_system_type ext2_fs_t + .name = "ext2", + .get_sb = ext2_get_sb, + .kill_sb = kill_block_super, +- .fs_flags = FS_REQUIRES_DEV, ++ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, + }; + + static int __init init_ext2_fs(void) +diff -uprN linux-2.6.15.orig/fs/ext3/inode.c linux-2.6.15-ve025stab014/fs/ext3/inode.c +--- linux-2.6.15.orig/fs/ext3/inode.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/ext3/inode.c 2006-01-27 14:48:05.000000000 +0300 +@@ -2728,6 +2728,10 @@ int ext3_write_inode(struct inode *inode + { + if (current->flags & PF_MEMALLOC) + return 0; ++#ifdef CONFIG_USER_RESOURCE ++ if (test_ti_thread_flag(current->thread_info, TIF_MEMDIE)) ++ return 0; ++#endif + + if (ext3_journal_current_handle()) { + jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); +diff -uprN linux-2.6.15.orig/fs/ext3/super.c linux-2.6.15-ve025stab014/fs/ext3/super.c +--- linux-2.6.15.orig/fs/ext3/super.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/ext3/super.c 2006-01-27 14:48:08.000000000 +0300 +@@ -2627,7 +2627,7 @@ static struct file_system_type ext3_fs_t + .name = "ext3", + .get_sb = ext3_get_sb, + .kill_sb = kill_block_super, +- .fs_flags = FS_REQUIRES_DEV, ++ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, + }; + + static int __init init_ext3_fs(void) +diff -uprN linux-2.6.15.orig/fs/fcntl.c linux-2.6.15-ve025stab014/fs/fcntl.c +--- linux-2.6.15.orig/fs/fcntl.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/fcntl.c 2006-01-27 14:48:08.000000000 +0300 +@@ -17,6 +17,7 @@ + #include <linux/ptrace.h> + #include <linux/signal.h> + #include <linux/rcupdate.h> ++#include <linux/ve_owner.h> + + #include <asm/poll.h> + #include <asm/siginfo.h> +@@ -250,6 +251,7 @@ static int setfl(int fd, struct file * f + static void f_modown(struct file *filp, 
unsigned long pid, + uid_t uid, uid_t euid, int force) + { ++ pid = comb_vpid_to_pid(pid); + write_lock_irq(&filp->f_owner.lock); + if (force || !filp->f_owner.pid) { + filp->f_owner.pid = pid; +@@ -316,7 +318,7 @@ static long do_fcntl(int fd, unsigned in + * current syscall conventions, the only way + * to fix this will be in libc. + */ +- err = filp->f_owner.pid; ++ err = comb_pid_to_vpid(filp->f_owner.pid); + force_successful_syscall_return(); + break; + case F_SETOWN: +@@ -468,23 +470,29 @@ static void send_sigio_to_task(struct ta + void send_sigio(struct fown_struct *fown, int fd, int band) + { + struct task_struct *p; ++ struct file *f; ++ struct ve_struct *ve; + int pid; + + read_lock(&fown->lock); + pid = fown->pid; + if (!pid) + goto out_unlock_fown; ++ ++ /* hack: fown's are always embedded in struct file */ ++ f = container_of(fown, struct file, f_owner); ++ ve = VE_OWNER_FILP(f); + + read_lock(&tasklist_lock); + if (pid > 0) { +- p = find_task_by_pid(pid); +- if (p) { ++ p = find_task_by_pid_all(pid); ++ if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) { + send_sigio_to_task(p, fown, fd, band); + } + } else { +- do_each_task_pid(-pid, PIDTYPE_PGID, p) { ++ __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) { + send_sigio_to_task(p, fown, fd, band); +- } while_each_task_pid(-pid, PIDTYPE_PGID, p); ++ } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve); + } + read_unlock(&tasklist_lock); + out_unlock_fown: +@@ -501,6 +509,8 @@ static void send_sigurg_to_task(struct t + int send_sigurg(struct fown_struct *fown) + { + struct task_struct *p; ++ struct file *f; ++ struct ve_struct *ve; + int pid, ret = 0; + + read_lock(&fown->lock); +@@ -509,17 +519,19 @@ int send_sigurg(struct fown_struct *fown + goto out_unlock_fown; + + ret = 1; ++ f = container_of(fown, struct file, f_owner); ++ ve = VE_OWNER_FILP(f); + + read_lock(&tasklist_lock); + if (pid > 0) { +- p = find_task_by_pid(pid); +- if (p) { ++ p = find_task_by_pid_all(pid); ++ if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) { + send_sigurg_to_task(p, fown); + } + } else { +- do_each_task_pid(-pid, PIDTYPE_PGID, p) { ++ __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) { + send_sigurg_to_task(p, fown); +- } while_each_task_pid(-pid, PIDTYPE_PGID, p); ++ } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve); + } + read_unlock(&tasklist_lock); + out_unlock_fown: +diff -uprN linux-2.6.15.orig/fs/file.c linux-2.6.15-ve025stab014/fs/file.c +--- linux-2.6.15.orig/fs/file.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/file.c 2006-01-27 14:48:05.000000000 +0300 +@@ -18,6 +18,8 @@ + #include <linux/rcupdate.h> + #include <linux/workqueue.h> + ++#include <ub/ub_mem.h> ++ + struct fdtable_defer { + spinlock_t lock; + struct work_struct wq; +@@ -44,9 +46,9 @@ struct file ** alloc_fd_array(int num) + int size = num * sizeof(struct file *); + + if (size <= PAGE_SIZE) +- new_fds = (struct file **) kmalloc(size, GFP_KERNEL); ++ new_fds = (struct file **) ub_kmalloc(size, GFP_KERNEL); + else +- new_fds = (struct file **) vmalloc(size); ++ new_fds = (struct file **) ub_vmalloc(size); + return new_fds; + } + +@@ -212,9 +214,9 @@ fd_set * alloc_fdset(int num) + int size = num / 8; + + if (size <= PAGE_SIZE) +- new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL); ++ new_fdset = (fd_set *) ub_kmalloc(size, GFP_KERNEL); + else +- new_fdset = (fd_set *) vmalloc(size); ++ new_fdset = (fd_set *) ub_vmalloc(size); + return new_fdset; + } + +diff -uprN linux-2.6.15.orig/fs/file_table.c 
linux-2.6.15-ve025stab014/fs/file_table.c +--- linux-2.6.15.orig/fs/file_table.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/file_table.c 2006-01-27 14:48:08.000000000 +0300 +@@ -8,6 +8,7 @@ + #include <linux/string.h> + #include <linux/slab.h> + #include <linux/file.h> ++#include <linux/ve_owner.h> + #include <linux/init.h> + #include <linux/module.h> + #include <linux/smp_lock.h> +@@ -19,6 +20,8 @@ + #include <linux/cdev.h> + #include <linux/fsnotify.h> + ++#include <ub/ub_misc.h> ++ + /* sysctl tunables... */ + struct files_stat_struct files_stat = { + .max_files = NR_FILE +@@ -57,6 +60,8 @@ void filp_dtor(void *objp, struct kmem_c + static inline void file_free_rcu(struct rcu_head *head) + { + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); ++ ub_file_uncharge(f); ++ put_ve(VE_OWNER_FILP(f)); + kmem_cache_free(filp_cachep, f); + } + +@@ -86,6 +91,11 @@ struct file *get_empty_filp(void) + goto fail; + + memset(f, 0, sizeof(*f)); ++ SET_VE_OWNER_FILP(f, get_ve(get_exec_env())); ++ ++ if (ub_file_charge(f)) ++ goto fail_ch; ++ + if (security_file_alloc(f)) + goto fail_sec; + +@@ -111,6 +121,10 @@ fail_sec: + file_free(f); + fail: + return NULL; ++ ++fail_ch: ++ kmem_cache_free(filp_cachep, f); ++ return NULL; + } + + EXPORT_SYMBOL(get_empty_filp); +diff -uprN linux-2.6.15.orig/fs/filesystems.c linux-2.6.15-ve025stab014/fs/filesystems.c +--- linux-2.6.15.orig/fs/filesystems.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/filesystems.c 2006-01-27 14:48:08.000000000 +0300 +@@ -13,6 +13,7 @@ + #include <linux/init.h> + #include <linux/module.h> + #include <linux/sched.h> /* for 'current' */ ++#include <linux/ve_owner.h> + #include <asm/uaccess.h> + + /* +@@ -22,8 +23,8 @@ + * During the unload module must call unregister_filesystem(). + * We can access the fields of list element if: + * 1) spinlock is held or +- * 2) we hold the reference to the module. +- * The latter can be guaranteed by call of try_module_get(); if it ++ * 2) we hold the reference to the element. ++ * The latter can be guaranteed by call of try_filesystem(); if it + * returned 0 we must skip the element, otherwise we got the reference. + * Once the reference is obtained we can drop the spinlock. 
+ */ +@@ -31,23 +32,51 @@ + static struct file_system_type *file_systems; + static DEFINE_RWLOCK(file_systems_lock); + ++int try_get_filesystem(struct file_system_type *fs) ++{ ++ if (try_module_get(fs->owner)) { ++#ifdef CONFIG_VE ++ get_ve(VE_OWNER_FSTYPE(fs)); ++#endif ++ return 1; ++ } ++ return 0; ++} ++ + /* WARNING: This can be used only if we _already_ own a reference */ + void get_filesystem(struct file_system_type *fs) + { ++#ifdef CONFIG_VE ++ get_ve(VE_OWNER_FSTYPE(fs)); ++#endif + __module_get(fs->owner); + } + + void put_filesystem(struct file_system_type *fs) + { + module_put(fs->owner); ++#ifdef CONFIG_VE ++ put_ve(VE_OWNER_FSTYPE(fs)); ++#endif ++} ++ ++static inline int check_ve_fstype(struct file_system_type *p, ++ struct ve_struct *env) ++{ ++ return ((p->fs_flags & FS_VIRTUALIZED) || ++ ve_accessible_strict(VE_OWNER_FSTYPE(p), env)); + } + +-static struct file_system_type **find_filesystem(const char *name) ++static struct file_system_type **find_filesystem(const char *name, ++ struct ve_struct *env) + { + struct file_system_type **p; +- for (p=&file_systems; *p; p=&(*p)->next) ++ for (p=&file_systems; *p; p=&(*p)->next) { ++ if (!check_ve_fstype(*p, env)) ++ continue; + if (strcmp((*p)->name,name) == 0) + break; ++ } + return p; + } + +@@ -74,8 +103,10 @@ int register_filesystem(struct file_syst + if (fs->next) + return -EBUSY; + INIT_LIST_HEAD(&fs->fs_supers); ++ if (VE_OWNER_FSTYPE(fs) == NULL) ++ SET_VE_OWNER_FSTYPE(fs, get_ve0()); + write_lock(&file_systems_lock); +- p = find_filesystem(fs->name); ++ p = find_filesystem(fs->name, VE_OWNER_FSTYPE(fs)); + if (*p) + res = -EBUSY; + else +@@ -132,11 +163,14 @@ static int fs_index(const char __user * + + err = -EINVAL; + read_lock(&file_systems_lock); +- for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { ++ for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) { ++ if (!check_ve_fstype(tmp, get_exec_env())) ++ continue; + if (strcmp(tmp->name,name) == 0) { + err = index; + break; + } ++ index++; + } + read_unlock(&file_systems_lock); + putname(name); +@@ -149,9 +183,15 @@ static int fs_name(unsigned int index, c + int len, res; + + read_lock(&file_systems_lock); +- for (tmp = file_systems; tmp; tmp = tmp->next, index--) +- if (index <= 0 && try_module_get(tmp->owner)) +- break; ++ for (tmp = file_systems; tmp; tmp = tmp->next) { ++ if (!check_ve_fstype(tmp, get_exec_env())) ++ continue; ++ if (!index) { ++ if (try_get_filesystem(tmp)) ++ break; ++ } else ++ index--; ++ } + read_unlock(&file_systems_lock); + if (!tmp) + return -EINVAL; +@@ -169,8 +209,9 @@ static int fs_maxindex(void) + int index; + + read_lock(&file_systems_lock); +- for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) +- ; ++ for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next) ++ if (check_ve_fstype(tmp, get_exec_env())) ++ index++; + read_unlock(&file_systems_lock); + return index; + } +@@ -206,9 +247,10 @@ int get_filesystem_list(char * buf) + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp && len < PAGE_SIZE - 80) { +- len += sprintf(buf+len, "%s\t%s\n", +- (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", +- tmp->name); ++ if (check_ve_fstype(tmp, get_exec_env())) ++ len += sprintf(buf+len, "%s\t%s\n", ++ (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", ++ tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); +@@ -220,14 +262,14 @@ struct file_system_type *get_fs_type(con + struct file_system_type *fs; + + read_lock(&file_systems_lock); +- fs = *(find_filesystem(name)); +- if (fs && !try_module_get(fs->owner)) ++ fs = *(find_filesystem(name, get_exec_env())); ++ if (fs && !try_get_filesystem(fs)) + fs = NULL; + read_unlock(&file_systems_lock); + if (!fs && (request_module("%s", name) == 0)) { + read_lock(&file_systems_lock); +- fs = *(find_filesystem(name)); +- if (fs && !try_module_get(fs->owner)) ++ fs = *(find_filesystem(name, get_exec_env())); ++ if (fs && !try_get_filesystem(fs)) + fs = NULL; + read_unlock(&file_systems_lock); + } +@@ -235,3 +277,5 @@ struct file_system_type *get_fs_type(con + } + + EXPORT_SYMBOL(get_fs_type); ++EXPORT_SYMBOL(get_filesystem); ++EXPORT_SYMBOL(put_filesystem); +diff -uprN linux-2.6.15.orig/fs/hugetlbfs/inode.c linux-2.6.15-ve025stab014/fs/hugetlbfs/inode.c +--- linux-2.6.15.orig/fs/hugetlbfs/inode.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/hugetlbfs/inode.c 2006-01-27 14:48:08.000000000 +0300 +@@ -802,7 +802,7 @@ struct file *hugetlb_zero_setup(size_t s + struct inode *inode; + struct dentry *dentry, *root; + struct qstr quick_string; +- char buf[16]; ++ char buf[64]; + + if (!can_do_hugetlb_shm()) + return ERR_PTR(-EPERM); +@@ -814,7 +814,8 @@ struct file *hugetlb_zero_setup(size_t s + return ERR_PTR(-ENOMEM); + + root = hugetlbfs_vfsmount->mnt_root; +- snprintf(buf, 16, "%lu", hugetlbfs_counter()); ++ snprintf(buf, sizeof(buf), "VE%d-%lu", ++ get_exec_env()->veid, hugetlbfs_counter()); + quick_string.name = buf; + quick_string.len = strlen(quick_string.name); + quick_string.hash = 0; +diff -uprN linux-2.6.15.orig/fs/inode.c linux-2.6.15-ve025stab014/fs/inode.c +--- linux-2.6.15.orig/fs/inode.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/inode.c 2006-01-27 14:48:08.000000000 +0300 +@@ -9,6 +9,7 @@ + #include <linux/mm.h> + #include <linux/dcache.h> + #include <linux/init.h> ++#include <linux/kernel_stat.h> + #include <linux/quotaops.h> + #include <linux/slab.h> + #include <linux/writeback.h> +@@ -97,13 +98,15 @@ DECLARE_MUTEX(iprune_sem); + */ + struct inodes_stat_t inodes_stat; + +-static kmem_cache_t * inode_cachep; ++kmem_cache_t *inode_cachep; ++ ++static struct address_space_operations vfs_empty_aops; ++struct inode_operations vfs_empty_iops; ++static struct file_operations vfs_empty_fops; ++EXPORT_SYMBOL(vfs_empty_iops); + + static struct inode *alloc_inode(struct super_block *sb) + { +- static struct address_space_operations empty_aops; +- static struct inode_operations empty_iops; +- static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) +@@ -118,8 +121,8 @@ static struct inode *alloc_inode(struct + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); +- inode->i_op = &empty_iops; +- inode->i_fop = &empty_fops; ++ inode->i_op = &vfs_empty_iops; ++ inode->i_fop = &vfs_empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; +@@ -143,7 +146,7 @@ static struct inode *alloc_inode(struct + return NULL; + } + +- mapping->a_ops = &empty_aops; ++ mapping->a_ops = &vfs_empty_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_HIGHUSER); +@@ -477,6 +480,7 @@ static void prune_icache(int nr_to_scan) + */ + static int shrink_icache_memory(int nr, gfp_t gfp_mask) + { 
++ KSTAT_PERF_ENTER(shrink_icache) + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, +@@ -487,6 +491,7 @@ static int shrink_icache_memory(int nr, + return -1; + prune_icache(nr); + } ++ KSTAT_PERF_LEAVE(shrink_icache) + return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + } + +@@ -736,7 +741,7 @@ EXPORT_SYMBOL(iunique); + struct inode *igrab(struct inode *inode) + { + spin_lock(&inode_lock); +- if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) ++ if (inode && !(inode->i_state & (I_FREEING|I_WILL_FREE))) + __iget(inode); + else + /* +diff -uprN linux-2.6.15.orig/fs/ioprio.c linux-2.6.15-ve025stab014/fs/ioprio.c +--- linux-2.6.15.orig/fs/ioprio.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/ioprio.c 2006-01-27 14:48:08.000000000 +0300 +@@ -77,18 +77,18 @@ asmlinkage long sys_ioprio_set(int which + if (!who) + p = current; + else +- p = find_task_by_pid(who); ++ p = find_task_by_pid_all(who); + if (p) + ret = set_task_ioprio(p, ioprio); + break; + case IOPRIO_WHO_PGRP: + if (!who) + who = process_group(current); +- do_each_task_pid(who, PIDTYPE_PGID, p) { ++ do_each_task_pid_all(who, PIDTYPE_PGID, p) { + ret = set_task_ioprio(p, ioprio); + if (ret) + break; +- } while_each_task_pid(who, PIDTYPE_PGID, p); ++ } while_each_task_pid_all(who, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) +@@ -99,13 +99,13 @@ asmlinkage long sys_ioprio_set(int which + if (!user) + break; + +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (p->uid != who) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + break; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + if (who) + free_uid(user); +@@ -130,19 +130,19 @@ asmlinkage long sys_ioprio_get(int which + if (!who) + p = current; + else +- p = find_task_by_pid(who); ++ p = find_task_by_pid_ve(who); + if (p) + ret = p->ioprio; + break; + case IOPRIO_WHO_PGRP: + if (!who) + who = process_group(current); +- do_each_task_pid(who, PIDTYPE_PGID, p) { ++ do_each_task_pid_ve(who, PIDTYPE_PGID, p) { + if (ret == -ESRCH) + ret = p->ioprio; + else + ret = ioprio_best(ret, p->ioprio); +- } while_each_task_pid(who, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(who, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) +@@ -153,14 +153,14 @@ asmlinkage long sys_ioprio_get(int which + if (!user) + break; + +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (p->uid != user->uid) + continue; + if (ret == -ESRCH) + ret = p->ioprio; + else + ret = ioprio_best(ret, p->ioprio); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + if (who) + free_uid(user); +diff -uprN linux-2.6.15.orig/fs/jbd/transaction.c linux-2.6.15-ve025stab014/fs/jbd/transaction.c +--- linux-2.6.15.orig/fs/jbd/transaction.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/jbd/transaction.c 2006-01-27 14:48:05.000000000 +0300 +@@ -1379,6 +1379,9 @@ int journal_stop(handle_t *handle) + * to wait for the commit to complete. 
+ */ + if (handle->h_sync && !(current->flags & PF_MEMALLOC)) ++#ifdef CONFIG_USER_RESOURCE ++ if (!test_ti_thread_flag(current->thread_info, TIF_MEMDIE)) ++#endif + err = log_wait_commit(journal, tid); + } else { + spin_unlock(&transaction->t_handle_lock); +diff -uprN linux-2.6.15.orig/fs/lockd/clntproc.c linux-2.6.15-ve025stab014/fs/lockd/clntproc.c +--- linux-2.6.15.orig/fs/lockd/clntproc.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/lockd/clntproc.c 2006-01-27 14:48:08.000000000 +0300 +@@ -127,10 +127,10 @@ static void nlmclnt_setlockargs(struct n + nlmclnt_next_cookie(&argp->cookie); + argp->state = nsm_local_state; + memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh)); +- lock->caller = system_utsname.nodename; ++ lock->caller = ve_utsname.nodename; + lock->oh.data = req->a_owner; + lock->oh.len = sprintf(req->a_owner, "%d@%s", +- current->pid, system_utsname.nodename); ++ current->pid, ve_utsname.nodename); + locks_copy_lock(&lock->fl, fl); + } + +@@ -151,7 +151,7 @@ nlmclnt_setgrantargs(struct nlm_rqst *ca + { + locks_copy_lock(&call->a_args.lock.fl, &lock->fl); + memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); +- call->a_args.lock.caller = system_utsname.nodename; ++ call->a_args.lock.caller = ve_utsname.nodename; + call->a_args.lock.oh.len = lock->oh.len; + + /* set default data area */ +diff -uprN linux-2.6.15.orig/fs/lockd/mon.c linux-2.6.15-ve025stab014/fs/lockd/mon.c +--- linux-2.6.15.orig/fs/lockd/mon.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/lockd/mon.c 2006-01-27 14:48:08.000000000 +0300 +@@ -148,7 +148,7 @@ xdr_encode_common(struct rpc_rqst *rqstp + */ + sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); + if (!(p = xdr_encode_string(p, buffer)) +- || !(p = xdr_encode_string(p, system_utsname.nodename))) ++ || !(p = xdr_encode_string(p, ve_utsname.nodename))) + return ERR_PTR(-EIO); + *p++ = htonl(argp->prog); + *p++ = htonl(argp->vers); +diff -uprN linux-2.6.15.orig/fs/locks.c linux-2.6.15-ve025stab014/fs/locks.c +--- linux-2.6.15.orig/fs/locks.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/locks.c 2006-01-27 14:48:08.000000000 +0300 +@@ -129,6 +129,8 @@ + #include <asm/semaphore.h> + #include <asm/uaccess.h> + ++#include <ub/ub_misc.h> ++ + #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) + #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) + #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) +@@ -148,11 +150,28 @@ static LIST_HEAD(blocked_list); + static kmem_cache_t *filelock_cache; + + /* Allocate an empty lock structure. */ +-static struct file_lock *locks_alloc_lock(void) ++static struct file_lock *locks_alloc_lock(int charge) + { +- return kmem_cache_alloc(filelock_cache, SLAB_KERNEL); ++ struct file_lock *fl; ++ ++ fl = kmem_cache_alloc(filelock_cache, SLAB_KERNEL); ++#ifdef CONFIG_USER_RESOURCE ++ if (fl == NULL) ++ goto out; ++ fl->fl_charged = 0; ++ if (!charge) ++ goto out; ++ if (!ub_flock_charge(fl, 1)) ++ goto out; ++ ++ kmem_cache_free(filelock_cache, fl); ++ fl = NULL; ++out: ++#endif ++ return fl; + } + ++ + /* Free a lock which is not in use. 
*/ + static inline void locks_free_lock(struct file_lock *fl) + { +@@ -181,6 +200,7 @@ static inline void locks_free_lock(struc + fl->fl_lmops = NULL; + } + ++ ub_flock_uncharge(fl); + kmem_cache_free(filelock_cache, fl); + } + +@@ -263,7 +283,7 @@ static int flock_make_lock(struct file * + if (type < 0) + return type; + +- fl = locks_alloc_lock(); ++ fl = locks_alloc_lock(type != F_UNLCK); + if (fl == NULL) + return -ENOMEM; + +@@ -451,7 +471,7 @@ static int lease_init(struct file *filp, + /* Allocate a file_lock initialised to this type of lease */ + static int lease_alloc(struct file *filp, int type, struct file_lock **flp) + { +- struct file_lock *fl = locks_alloc_lock(); ++ struct file_lock *fl = locks_alloc_lock(1); + int error; + + if (fl == NULL) +@@ -785,8 +805,11 @@ static int __posix_lock_file(struct inod + * We may need two file_lock structures for this operation, + * so we get them in advance to avoid races. + */ +- new_fl = locks_alloc_lock(); +- new_fl2 = locks_alloc_lock(); ++ if (request->fl_type != F_UNLCK) ++ new_fl = locks_alloc_lock(1); ++ else ++ new_fl = NULL; ++ new_fl2 = locks_alloc_lock(0); + + lock_kernel(); + if (request->fl_type != F_UNLCK) { +@@ -814,7 +837,7 @@ static int __posix_lock_file(struct inod + goto out; + + error = -ENOLCK; /* "no luck" */ +- if (!(new_fl && new_fl2)) ++ if (!((request->fl_type == F_UNLCK || new_fl) && new_fl2)) + goto out; + + /* +@@ -920,19 +943,30 @@ static int __posix_lock_file(struct inod + if (!added) { + if (request->fl_type == F_UNLCK) + goto out; ++ error = -ENOLCK; ++ if (right && (left == right) && ub_flock_charge(new_fl, 1)) ++ goto out; + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); + new_fl = NULL; ++ error = 0; + } + if (right) { + if (left == right) { + /* The new lock breaks the old one in two pieces, + * so we have to use the second new lock. + */ ++ error = -ENOLCK; ++ if (added && ub_flock_charge(new_fl2, ++ request->fl_type != F_UNLCK)) ++ goto out; ++ /* FIXME move all fl_charged manipulations in ub code */ ++ set_flock_charged(new_fl2); + left = new_fl2; + new_fl2 = NULL; + locks_copy_lock(left, right); + locks_insert_lock(before, left); ++ error = 0; + } + right->fl_start = request->fl_end + 1; + locks_wake_up_blocks(right); +@@ -1574,7 +1608,7 @@ int fcntl_getlk(struct file *filp, struc + + flock.l_type = F_UNLCK; + if (fl != NULL) { +- flock.l_pid = fl->fl_pid; ++ flock.l_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); + #if BITS_PER_LONG == 32 + /* + * Make sure we can represent the posix lock via +@@ -1606,7 +1640,7 @@ out: + int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock __user *l) + { +- struct file_lock *file_lock = locks_alloc_lock(); ++ struct file_lock *file_lock = locks_alloc_lock(0); + struct flock flock; + struct inode *inode; + int error; +@@ -1728,7 +1762,7 @@ int fcntl_getlk64(struct file *filp, str + + flock.l_type = F_UNLCK; + if (fl != NULL) { +- flock.l_pid = fl->fl_pid; ++ flock.l_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); + flock.l_start = fl->fl_start; + flock.l_len = fl->fl_end == OFFSET_MAX ? 
0 : + fl->fl_end - fl->fl_start + 1; +@@ -1749,7 +1783,7 @@ out: + int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock64 __user *l) + { +- struct file_lock *file_lock = locks_alloc_lock(); ++ struct file_lock *file_lock = locks_alloc_lock(0); + struct flock64 flock; + struct inode *inode; + int error; +@@ -1981,7 +2015,9 @@ EXPORT_SYMBOL(posix_unblock_lock); + static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx) + { + struct inode *inode = NULL; ++ unsigned int fl_pid; + ++ fl_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); + if (fl->fl_file != NULL) + inode = fl->fl_file->f_dentry->d_inode; + +@@ -2023,16 +2059,16 @@ static void lock_get_status(char* out, s + } + if (inode) { + #ifdef WE_CAN_BREAK_LSLK_NOW +- out += sprintf(out, "%d %s:%ld ", fl->fl_pid, ++ out += sprintf(out, "%d %s:%ld ", fl_pid, + inode->i_sb->s_id, inode->i_ino); + #else + /* userspace relies on this representation of dev_t ;-( */ +- out += sprintf(out, "%d %02x:%02x:%ld ", fl->fl_pid, ++ out += sprintf(out, "%d %02x:%02x:%ld ", fl_pid, + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino); + #endif + } else { +- out += sprintf(out, "%d <none>:0 ", fl->fl_pid); ++ out += sprintf(out, "%d <none>:0 ", fl_pid); + } + if (IS_POSIX(fl)) { + if (fl->fl_end == OFFSET_MAX) +@@ -2081,11 +2117,17 @@ int get_locks_status(char *buffer, char + char *q = buffer; + off_t pos = 0; + int i = 0; ++ struct ve_struct *env; + + lock_kernel(); ++ env = get_exec_env(); + list_for_each(tmp, &file_lock_list) { + struct list_head *btmp; + struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link); ++ ++ if (!ve_accessible(VE_OWNER_FILP(fl->fl_file), env)) ++ continue; ++ + lock_get_status(q, fl, ++i, ""); + move_lock_status(&q, &pos, offset); + +@@ -2243,7 +2285,7 @@ EXPORT_SYMBOL(steal_locks); + static int __init filelock_init(void) + { + filelock_cache = kmem_cache_create("file_lock_cache", +- sizeof(struct file_lock), 0, SLAB_PANIC, ++ sizeof(struct file_lock), 0, SLAB_PANIC | SLAB_UBC, + init_once, NULL); + return 0; + } +diff -uprN linux-2.6.15.orig/fs/namei.c linux-2.6.15-ve025stab014/fs/namei.c +--- linux-2.6.15.orig/fs/namei.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/namei.c 2006-01-27 14:48:08.000000000 +0300 +@@ -701,7 +701,14 @@ static inline void follow_dotdot(struct + read_unlock(¤t->fs->lock); + break; + } +- read_unlock(¤t->fs->lock); ++#ifdef CONFIG_VE ++ if (nd->dentry == get_exec_env()->fs_root && ++ nd->mnt == get_exec_env()->fs_rootmnt) { ++ read_unlock(¤t->fs->lock); ++ break; ++ } ++#endif ++ read_unlock(¤t->fs->lock); + spin_lock(&dcache_lock); + if (nd->dentry != nd->mnt->mnt_root) { + nd->dentry = dget(nd->dentry->d_parent); +@@ -742,6 +749,10 @@ static int do_lookup(struct nameidata *n + if (dentry->d_op && dentry->d_op->d_revalidate) + goto need_revalidate; + done: ++ if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) { ++ dput(dentry); ++ return -ENOENT; ++ } + path->mnt = mnt; + path->dentry = dentry; + __follow_mount(path); +@@ -861,6 +872,9 @@ static fastcall int __link_path_walk(con + goto out_dput; + + if (inode->i_op->follow_link) { ++ err = -ENOENT; ++ if (lookup_flags & LOOKUP_STRICT) ++ goto out_dput; + err = do_follow_link(&next, nd); + if (err) + goto return_err; +@@ -907,6 +921,7 @@ last_component: + break; + inode = next.dentry->d_inode; + if ((lookup_flags & LOOKUP_FOLLOW) ++ && !(lookup_flags & LOOKUP_STRICT) + && inode && inode->i_op && inode->i_op->follow_link) { + err = 
do_follow_link(&next, nd); + if (err) +@@ -947,6 +962,11 @@ return_reval: + break; + } + return_base: ++ if (!(nd->flags & LOOKUP_NOAREACHECK)) { ++ err = check_area_access_ve(nd->dentry, nd->mnt); ++ if (err) ++ break; ++ } + return 0; + out_dput: + dput_path(&next, nd); +@@ -2278,6 +2298,9 @@ int vfs_rename(struct inode *old_dir, st + int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + const char *old_name; + ++ if (DQUOT_RENAME(old_dentry->d_inode, old_dir, new_dir)) ++ return -EXDEV; ++ + if (old_dentry->d_inode == new_dentry->d_inode) + return 0; + +diff -uprN linux-2.6.15.orig/fs/namespace.c linux-2.6.15-ve025stab014/fs/namespace.c +--- linux-2.6.15.orig/fs/namespace.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/namespace.c 2006-01-27 14:48:08.000000000 +0300 +@@ -39,13 +39,15 @@ static inline int sysfs_init(void) + + /* spinlock for vfsmount related operations, inplace of dcache_lock */ + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); ++EXPORT_SYMBOL(vfsmount_lock); + + static int event; + + static struct list_head *mount_hashtable; + static int hash_mask __read_mostly, hash_bits __read_mostly; + static kmem_cache_t *mnt_cache; +-static struct rw_semaphore namespace_sem; ++struct rw_semaphore namespace_sem; ++EXPORT_SYMBOL(namespace_sem); + + static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) + { +@@ -366,10 +368,32 @@ static int show_vfsmnt(struct seq_file * + { 0, NULL } + }; + struct proc_fs_info *fs_infop; ++ char *path_buf, *path; + +- mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); ++ /* skip FS_NOMOUNT mounts (rootfs) */ ++ if (mnt->mnt_sb->s_flags & MS_NOUSER) ++ return 0; ++ ++ path_buf = (char *) __get_free_page(GFP_KERNEL); ++ if (!path_buf) ++ return -ENOMEM; ++ path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) { ++ free_page((unsigned long) path_buf); ++ /* ++ * This means that the file position will be incremented, i.e. ++ * the total number of "invisible" vfsmnt will leak. ++ */ ++ return 0; ++ } ++ ++ if (ve_is_super(get_exec_env())) ++ mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); ++ else ++ mangle(m, mnt->mnt_sb->s_type->name); + seq_putc(m, ' '); +- seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); ++ mangle(m, path); ++ free_page((unsigned long) path_buf); + seq_putc(m, ' '); + mangle(m, mnt->mnt_sb->s_type->name); + seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? 
" ro" : " rw"); +@@ -469,6 +493,7 @@ void release_mounts(struct list_head *he + mntput(mnt); + } + } ++EXPORT_SYMBOL(release_mounts); + + void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) + { +@@ -493,6 +518,7 @@ void umount_tree(struct vfsmount *mnt, i + change_mnt_propagation(p, MS_PRIVATE); + } + } ++EXPORT_SYMBOL(umount_tree); + + static int do_umount(struct vfsmount *mnt, int flags) + { +@@ -603,7 +629,7 @@ asmlinkage long sys_umount(char __user * + goto dput_and_out; + + retval = -EPERM; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + goto dput_and_out; + + retval = do_umount(nd.mnt, flags); +@@ -627,7 +653,7 @@ asmlinkage long sys_oldumount(char __use + + static int mount_is_safe(struct nameidata *nd) + { +- if (capable(CAP_SYS_ADMIN)) ++ if (capable(CAP_VE_SYS_ADMIN)) + return 0; + return -EPERM; + #ifdef notyet +@@ -912,7 +938,7 @@ static int do_remount(struct nameidata * + int err; + struct super_block *sb = nd->mnt->mnt_sb; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + if (!check_mnt(nd->mnt)) +@@ -946,7 +972,7 @@ static int do_move_mount(struct nameidat + struct nameidata old_nd, parent_nd; + struct vfsmount *p; + int err = 0; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; +@@ -1026,7 +1052,7 @@ static int do_new_mount(struct nameidata + return -EINVAL; + + /* we need capabilities... */ +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + mnt = do_kern_mount(type, flags, name, data); +@@ -1067,6 +1093,10 @@ int do_add_mount(struct vfsmount *newmnt + if ((err = graft_tree(newmnt, nd))) + goto unlock; + ++ if (newmnt->mnt_mountpoint->d_flags & DCACHE_VIRTUAL) ++ /* unaccessible yet - no lock */ ++ newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL; ++ + if (fslist) { + /* add to the specified expiration list */ + spin_lock(&vfsmount_lock); +@@ -1494,7 +1524,7 @@ static void chroot_fs_refs(struct nameid + struct fs_struct *fs; + + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_ve(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { +@@ -1509,7 +1539,7 @@ static void chroot_fs_refs(struct nameid + put_fs_struct(fs); + } else + task_unlock(p); +- } while_each_thread(g, p); ++ } while_each_thread_ve(g, p); + read_unlock(&tasklist_lock); + } + +@@ -1658,10 +1688,10 @@ static void __init init_mount_tree(void) + + init_task.namespace = namespace; + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + get_namespace(namespace); + p->namespace = namespace; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + + set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); +@@ -1677,7 +1707,8 @@ void __init mnt_init(unsigned long mempa + init_rwsem(&namespace_sem); + + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), +- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); ++ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, ++ NULL, NULL); + + mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + +diff -uprN linux-2.6.15.orig/fs/nfs/nfsroot.c linux-2.6.15-ve025stab014/fs/nfs/nfsroot.c +--- linux-2.6.15.orig/fs/nfs/nfsroot.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/nfs/nfsroot.c 2006-01-27 14:48:08.000000000 +0300 +@@ -310,7 +310,7 @@ static int __init root_nfs_name(char *na + /* Override them by options set on kernel command-line 
*/ + root_nfs_parse(name, buf); + +- cp = system_utsname.nodename; ++ cp = ve_utsname.nodename; + if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { + printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); + return -1; +diff -uprN linux-2.6.15.orig/fs/open.c linux-2.6.15-ve025stab014/fs/open.c +--- linux-2.6.15.orig/fs/open.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/open.c 2006-01-27 14:48:07.000000000 +0300 +@@ -23,6 +23,7 @@ + #include <linux/fs.h> + #include <linux/personality.h> + #include <linux/pagemap.h> ++#include <linux/faudit.h> + #include <linux/syscalls.h> + #include <linux/rcupdate.h> + +@@ -119,6 +120,34 @@ static int vfs_statfs64(struct super_blo + return 0; + } + ++static int faudit_statfs(struct vfsmount *mnt, struct dentry *dentry, ++ struct statfs *buf) ++{ ++ struct faudit_stat_arg arg; ++ ++ arg.mnt = mnt; ++ arg.dentry = dentry; ++ arg.stat = buf; ++ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg) ++ != NOTIFY_DONE) ++ return arg.err; ++ return 0; ++} ++ ++static int faudit_statfs64(struct vfsmount *mnt, struct dentry *dentry, ++ struct statfs64 *buf) ++{ ++ struct faudit_stat_arg arg; ++ ++ arg.mnt = mnt; ++ arg.dentry = dentry; ++ arg.stat = buf; ++ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS64, ++ &arg) != NOTIFY_DONE) ++ return arg.err; ++ return 0; ++} ++ + asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf) + { + struct nameidata nd; +@@ -128,6 +157,8 @@ asmlinkage long sys_statfs(const char __ + if (!error) { + struct statfs tmp; + error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp); ++ if (!error) ++ error = faudit_statfs(nd.mnt, nd.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_release(&nd); +@@ -147,6 +178,8 @@ asmlinkage long sys_statfs64(const char + if (!error) { + struct statfs64 tmp; + error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp); ++ if (!error) ++ error = faudit_statfs64(nd.mnt, nd.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_release(&nd); +@@ -166,6 +199,8 @@ asmlinkage long sys_fstatfs(unsigned int + if (!file) + goto out; + error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp); ++ if (!error) ++ error = faudit_statfs(file->f_vfsmnt, file->f_dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +@@ -187,6 +222,8 @@ asmlinkage long sys_fstatfs64(unsigned i + if (!file) + goto out; + error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp); ++ if (!error) ++ error = faudit_statfs64(file->f_vfsmnt, file->f_dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +diff -uprN linux-2.6.15.orig/fs/partitions/check.c linux-2.6.15-ve025stab014/fs/partitions/check.c +--- linux-2.6.15.orig/fs/partitions/check.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/partitions/check.c 2006-01-27 14:48:08.000000000 +0300 +@@ -124,6 +124,7 @@ char *disk_name(struct gendisk *hd, int + + return buf; + } ++EXPORT_SYMBOL(disk_name); + + const char *bdevname(struct block_device *bdev, char *buf) + { +diff -uprN linux-2.6.15.orig/fs/proc/array.c linux-2.6.15-ve025stab014/fs/proc/array.c +--- linux-2.6.15.orig/fs/proc/array.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/proc/array.c 2006-01-27 14:48:08.000000000 +0300 +@@ -76,6 +76,8 @@ + #include <linux/cpuset.h> + #include <linux/rcupdate.h> + 
++#include <ub/beancounter.h> ++ + #include <asm/uaccess.h> + #include <asm/pgtable.h> + #include <asm/io.h> +@@ -161,8 +163,13 @@ static inline char * task_state(struct t + struct group_info *group_info; + int g; + struct fdtable *fdt = NULL; ++ pid_t pid, ppid, tgid; ++ ++ pid = get_task_pid(p); ++ tgid = get_task_tgid(p); + + read_lock(&tasklist_lock); ++ ppid = get_task_ppid(p); + buffer += sprintf(buffer, + "State:\t%s\n" + "SleepAVG:\t%lu%%\n" +@@ -174,9 +181,9 @@ static inline char * task_state(struct t + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), + (p->sleep_avg/1024)*100/(1020000000/1024), +- p->tgid, +- p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, +- pid_alive(p) && p->ptrace ? p->parent->pid : 0, ++ tgid, ++ pid, ppid, ++ pid_alive(p) && p->ptrace ? get_task_pid(p->parent) : 0, + p->uid, p->euid, p->suid, p->fsuid, + p->gid, p->egid, p->sgid, p->fsgid); + read_unlock(&tasklist_lock); +@@ -199,6 +206,14 @@ static inline char * task_state(struct t + put_group_info(group_info); + + buffer += sprintf(buffer, "\n"); ++ ++#ifdef CONFIG_VE ++ buffer += sprintf(buffer, ++ "envID:\t%d\n" ++ "VPid:\t%d\n", ++ VE_TASK_INFO(p)->owner_env->veid, ++ virt_pid(p)); ++#endif + return buffer; + } + +@@ -293,10 +308,27 @@ static inline char *task_cap(struct task + cap_t(p->cap_effective)); + } + ++#ifdef CONFIG_USER_RESOURCE ++static inline void ub_dump_task_info(struct task_struct *tsk, ++ char *stsk, int ltsk, char *smm, int lmm) ++{ ++ print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk); ++ task_lock(tsk); ++ if (tsk->mm) ++ print_ub_uid(tsk->mm->mm_ub, smm, lmm); ++ else ++ strncpy(smm, "N/A", lmm); ++ task_unlock(tsk); ++} ++#endif ++ + int proc_pid_status(struct task_struct *task, char * buffer) + { + char * orig = buffer; + struct mm_struct *mm = get_task_mm(task); ++#ifdef CONFIG_USER_RESOURCE ++ char tsk_ub_info[64], mm_ub_info[64]; ++#endif + + buffer = task_name(task, buffer); + buffer = task_state(task, buffer); +@@ -311,6 +343,14 @@ int proc_pid_status(struct task_struct * + #if defined(CONFIG_ARCH_S390) + buffer = task_show_regs(task, buffer); + #endif ++#ifdef CONFIG_USER_RESOURCE ++ ub_dump_task_info(task, ++ tsk_ub_info, sizeof(tsk_ub_info), ++ mm_ub_info, sizeof(mm_ub_info)); ++ ++ buffer += sprintf(buffer, "TaskUB:\t%s\n", tsk_ub_info); ++ buffer += sprintf(buffer, "MMUB:\t%s\n", mm_ub_info); ++#endif + return buffer - orig; + } + +@@ -333,6 +373,10 @@ static int do_task_stat(struct task_stru + unsigned long it_real_value = 0; + struct task_struct *t; + char tcomm[sizeof(task->comm)]; ++#ifdef CONFIG_USER_RESOURCE ++ char ub_task_info[64]; ++ char ub_mm_info[64]; ++#endif + + state = *get_task_state(task); + vsize = eip = esp = 0; +@@ -370,11 +414,12 @@ static int do_task_stat(struct task_stru + } + if (task->signal) { + if (task->signal->tty) { +- tty_pgrp = task->signal->tty->pgrp; ++ tty_pgrp = pid_type_to_vpid(PIDTYPE_PGID, ++ task->signal->tty->pgrp); + tty_nr = new_encode_dev(tty_devnum(task->signal->tty)); + } +- pgid = process_group(task); +- sid = task->signal->session; ++ pgid = get_task_pgid(task); ++ sid = get_task_sid(task); + cmin_flt = task->signal->cmin_flt; + cmaj_flt = task->signal->cmaj_flt; + cutime = task->signal->cutime; +@@ -388,7 +433,7 @@ static int do_task_stat(struct task_stru + } + it_real_value = task->signal->it_real_value; + } +- ppid = pid_alive(task) ? 
task->group_leader->real_parent->tgid : 0; ++ ppid = get_task_ppid(task); + read_unlock(&tasklist_lock); + + if (!whole || num_threads<2) +@@ -407,14 +452,34 @@ static int do_task_stat(struct task_stru + + /* Temporary variable needed for gcc-2.96 */ + /* convert timespec -> nsec*/ ++#ifndef CONFIG_VE + start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC + + task->start_time.tv_nsec; ++#else ++ start_time = (unsigned long long)(task->start_time.tv_sec - ++ get_exec_env()->init_entry->start_time.tv_sec) * ++ NSEC_PER_SEC + task->start_time.tv_nsec - ++ get_exec_env()->init_entry->start_time.tv_nsec; ++#endif + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); + ++#ifdef CONFIG_USER_RESOURCE ++ ub_dump_task_info(task, ++ ub_task_info, sizeof(ub_task_info), ++ ub_mm_info, sizeof(ub_mm_info)); ++#endif ++ + res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ + %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \ +-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", ++%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu" ++#ifdef CONFIG_VE ++"0 0 0 0 0 0 0 0 %d %u" ++#endif ++#ifdef CONFIG_USER_RESOURCE ++ " %s %s" ++#endif ++ "\n", + task->pid, + tcomm, + state, +@@ -459,7 +524,16 @@ static int do_task_stat(struct task_stru + task->exit_signal, + task_cpu(task), + task->rt_priority, +- task->policy); ++ task->policy ++#ifdef CONFIG_VE ++ , virt_pid(task), ++ VEID(VE_TASK_INFO(task)->owner_env) ++#endif ++#ifdef CONFIG_USER_RESOURCE ++ , ub_task_info, ++ ub_mm_info ++#endif ++ ); + if(mm) + mmput(mm); + return res; +diff -uprN linux-2.6.15.orig/fs/proc/base.c linux-2.6.15-ve025stab014/fs/proc/base.c +--- linux-2.6.15.orig/fs/proc/base.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/proc/base.c 2006-01-27 14:48:08.000000000 +0300 +@@ -290,22 +290,25 @@ static int proc_fd_link(struct inode *in + struct files_struct *files; + struct file *file; + int fd = proc_type(inode) - PROC_TID_FD_DIR; ++ int err = -ENOENT; + + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { +- *mnt = mntget(file->f_vfsmnt); +- *dentry = dget(file->f_dentry); +- rcu_read_unlock(); +- put_files_struct(files); +- return 0; ++ if (d_root_check(file->f_dentry, file->f_vfsmnt)) { ++ err = -EACCES; ++ } else { ++ *mnt = mntget(file->f_vfsmnt); ++ *dentry = dget(file->f_dentry); ++ err = 0; ++ } + } + rcu_read_unlock(); + put_files_struct(files); + } +- return -ENOENT; ++ return err; + } + + static struct fs_struct *get_fs_struct(struct task_struct *task) +@@ -325,10 +328,12 @@ static int proc_cwd_link(struct inode *i + int result = -ENOENT; + if (fs) { + read_lock(&fs->lock); +- *mnt = mntget(fs->pwdmnt); +- *dentry = dget(fs->pwd); ++ result = d_root_check(fs->pwd, fs->pwdmnt); ++ if (!result) { ++ *mnt = mntget(fs->pwdmnt); ++ *dentry = dget(fs->pwd); ++ } + read_unlock(&fs->lock); +- result = 0; + put_fs_struct(fs); + } + return result; +@@ -1302,6 +1307,10 @@ static struct inode *proc_pid_make_inode + struct inode * inode; + struct proc_inode *ei; + ++ if (!ve_accessible(VE_TASK_INFO(task)->owner_env, ++ VE_OWNER_FSTYPE(sb->s_type))) ++ return NULL; ++ + /* We need a new inode */ + + inode = new_inode(sb); +@@ -1405,6 +1414,10 @@ static void pid_base_iput(struct dentry + spin_lock(&task->proc_lock); + if (task->proc_dentry == dentry) + task->proc_dentry = NULL; ++#ifdef CONFIG_VE ++ if (VE_TASK_INFO(task)->glob_proc_dentry == dentry) ++ VE_TASK_INFO(task)->glob_proc_dentry = 
NULL; ++#endif + spin_unlock(&task->proc_lock); + iput(inode); + } +@@ -1878,14 +1891,14 @@ static int proc_self_readlink(struct den + int buflen) + { + char tmp[30]; +- sprintf(tmp, "%d", current->tgid); ++ sprintf(tmp, "%d", get_task_tgid(current)); + return vfs_readlink(dentry,buffer,buflen,tmp); + } + + static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) + { + char tmp[30]; +- sprintf(tmp, "%d", current->tgid); ++ sprintf(tmp, "%d", get_task_tgid(current)); + return ERR_PTR(vfs_follow_link(nd,tmp)); + } + +@@ -1910,11 +1923,8 @@ static struct inode_operations proc_self + * of PIDTYPE_PID. + */ + +-struct dentry *proc_pid_unhash(struct task_struct *p) ++struct dentry *__proc_pid_unhash(struct task_struct *p, struct dentry *proc_dentry) + { +- struct dentry *proc_dentry; +- +- proc_dentry = p->proc_dentry; + if (proc_dentry != NULL) { + + spin_lock(&dcache_lock); +@@ -1932,6 +1942,14 @@ struct dentry *proc_pid_unhash(struct ta + return proc_dentry; + } + ++void proc_pid_unhash(struct task_struct *p, struct dentry *pd[2]) ++{ ++ pd[0] = __proc_pid_unhash(p, p->proc_dentry); ++#ifdef CONFIG_VE ++ pd[1] = __proc_pid_unhash(p, VE_TASK_INFO(p)->glob_proc_dentry); ++#endif ++} ++ + /** + * proc_pid_flush - recover memory used by stale /proc/@pid/x entries + * @proc_dentry: directoy to prune. +@@ -1939,7 +1957,7 @@ struct dentry *proc_pid_unhash(struct ta + * Shrink the /proc directory that was used by the just killed thread. + */ + +-void proc_pid_flush(struct dentry *proc_dentry) ++void __proc_pid_flush(struct dentry *proc_dentry) + { + might_sleep(); + if(proc_dentry != NULL) { +@@ -1948,12 +1966,21 @@ void proc_pid_flush(struct dentry *proc_ + } + } + ++void proc_pid_flush(struct dentry *proc_dentry[2]) ++{ ++ __proc_pid_flush(proc_dentry[0]); ++#ifdef CONFIG_VE ++ __proc_pid_flush(proc_dentry[1]); ++#endif ++} ++ + /* SMP-safe */ + struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) + { + struct task_struct *task; + struct inode *inode; + struct proc_inode *ei; ++ struct dentry *pd[2]; + unsigned tgid; + int died; + +@@ -1977,7 +2004,19 @@ struct dentry *proc_pid_lookup(struct in + goto out; + + read_lock(&tasklist_lock); +- task = find_task_by_pid(tgid); ++ task = find_task_by_pid_ve(tgid); ++ /* In theory we are allowed to lookup both /proc/VIRT_PID and ++ * /proc/GLOBAL_PID inside VE. However, current /proc implementation ++ * cannot maintain two references to one task, so that we have ++ * to prohibit /proc/GLOBAL_PID. ++ */ ++ if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tgid)) { ++ /* However, VE_ENTERed tasks are exception, they use global ++ * pids. 
++ */ ++ if (virt_pid(task) != tgid) ++ task = NULL; ++ } + if (task) + get_task_struct(task); + read_unlock(&tasklist_lock); +@@ -2006,16 +2045,23 @@ struct dentry *proc_pid_lookup(struct in + died = 0; + d_add(dentry, inode); + spin_lock(&task->proc_lock); ++#ifdef CONFIG_VE ++ if (ve_is_super(VE_OWNER_FSTYPE(inode->i_sb->s_type))) ++ VE_TASK_INFO(task)->glob_proc_dentry = dentry; ++ else ++ task->proc_dentry = dentry; ++#else + task->proc_dentry = dentry; ++#endif + if (!pid_alive(task)) { +- dentry = proc_pid_unhash(task); ++ proc_pid_unhash(task, pd); + died = 1; + } + spin_unlock(&task->proc_lock); + + put_task_struct(task); + if (died) { +- proc_pid_flush(dentry); ++ proc_pid_flush(pd); + goto out; + } + return NULL; +@@ -2036,7 +2082,12 @@ static struct dentry *proc_task_lookup(s + goto out; + + read_lock(&tasklist_lock); +- task = find_task_by_pid(tid); ++ task = find_task_by_pid_ve(tid); ++ /* See comment above in similar place. */ ++ if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tid)) { ++ if (virt_pid(task) != tid) ++ task = NULL; ++ } + if (task) + get_task_struct(task); + read_unlock(&tasklist_lock); +@@ -2080,7 +2131,8 @@ out: + * tasklist lock while doing this, and we must release it before + * we actually do the filldir itself, so we use a temp buffer.. + */ +-static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) ++static int get_tgid_list(int index, unsigned long version, unsigned int *tgids, ++ struct ve_struct *ve) + { + struct task_struct *p; + int nr_tgids = 0; +@@ -2089,7 +2141,11 @@ static int get_tgid_list(int index, unsi + read_lock(&tasklist_lock); + p = NULL; + if (version) { +- p = find_task_by_pid(version); ++ struct ve_struct *oldve; ++ ++ oldve = set_exec_env(ve); ++ p = find_task_by_pid_ve(version); ++ (void)set_exec_env(oldve); + if (p && !thread_group_leader(p)) + p = NULL; + } +@@ -2097,10 +2153,10 @@ static int get_tgid_list(int index, unsi + if (p) + index = 0; + else +- p = next_task(&init_task); ++ p = __first_task_ve(ve); + +- for ( ; p != &init_task; p = next_task(p)) { +- int tgid = p->pid; ++ for ( ; p != NULL; p = __next_task_ve(ve, p)) { ++ int tgid = get_task_pid_ve(p, ve); + if (!pid_alive(p)) + continue; + if (--index >= 0) +@@ -2133,7 +2189,7 @@ static int get_tid_list(int index, unsig + * via next_thread(). + */ + if (pid_alive(task)) do { +- int tid = task->pid; ++ int tid = get_task_pid(task); + + if (--index >= 0) + continue; +@@ -2170,7 +2226,8 @@ int proc_pid_readdir(struct file * filp, + next_tgid = filp->f_version; + filp->f_version = 0; + for (;;) { +- nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); ++ nr_tgids = get_tgid_list(nr, next_tgid, tgid_array, ++ filp->f_dentry->d_sb->s_type->owner_env); + if (!nr_tgids) { + /* no more entries ! 
*/ + break; +diff -uprN linux-2.6.15.orig/fs/proc/generic.c linux-2.6.15-ve025stab014/fs/proc/generic.c +--- linux-2.6.15.orig/fs/proc/generic.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/proc/generic.c 2006-01-27 14:48:08.000000000 +0300 +@@ -10,7 +10,9 @@ + + #include <linux/errno.h> + #include <linux/time.h> ++#include <linux/fs.h> + #include <linux/proc_fs.h> ++#include <linux/ve_owner.h> + #include <linux/stat.h> + #include <linux/module.h> + #include <linux/mount.h> +@@ -27,6 +29,8 @@ static ssize_t proc_file_write(struct fi + size_t count, loff_t *ppos); + static loff_t proc_file_lseek(struct file *, loff_t, int); + ++static DEFINE_RWLOCK(proc_tree_lock); ++ + int proc_match(int len, const char *name, struct proc_dir_entry *de) + { + if (de->namelen != len) +@@ -227,6 +231,7 @@ proc_file_lseek(struct file *file, loff_ + return retval; + } + ++#ifndef CONFIG_VE + static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) + { + struct inode *inode = dentry->d_inode; +@@ -259,9 +264,12 @@ static int proc_getattr(struct vfsmount + generic_fillattr(inode, stat); + return 0; + } ++#endif + + static struct inode_operations proc_file_inode_operations = { ++#ifndef CONFIG_VE + .setattr = proc_notify_change, ++#endif + }; + + /* +@@ -269,14 +277,20 @@ static struct inode_operations proc_file + * returns the struct proc_dir_entry for "/proc/tty/driver", and + * returns "serial" in residual. + */ +-static int xlate_proc_name(const char *name, ++static int __xlate_proc_name(struct proc_dir_entry *root, const char *name, + struct proc_dir_entry **ret, const char **residual) + { + const char *cp = name, *next; + struct proc_dir_entry *de; + int len; + +- de = &proc_root; ++ if (*ret) { ++ de_get(*ret); ++ return 0; ++ } ++ ++ read_lock(&proc_tree_lock); ++ de = root; + while (1) { + next = strchr(cp, '/'); + if (!next) +@@ -287,15 +301,35 @@ static int xlate_proc_name(const char *n + if (proc_match(len, cp, de)) + break; + } +- if (!de) ++ if (!de) { ++ read_unlock(&proc_tree_lock); + return -ENOENT; ++ } + cp += len + 1; + } + *residual = cp; +- *ret = de; ++ *ret = de_get(de); ++ read_unlock(&proc_tree_lock); + return 0; + } + ++#ifndef CONFIG_VE ++#define xlate_proc_loc_name xlate_proc_name ++#else ++static int xlate_proc_loc_name(const char *name, ++ struct proc_dir_entry **ret, const char **residual) ++{ ++ return __xlate_proc_name(get_exec_env()->proc_root, ++ name, ret, residual); ++} ++#endif ++ ++static int xlate_proc_name(const char *name, ++ struct proc_dir_entry **ret, const char **residual) ++{ ++ return __xlate_proc_name(&proc_root, name, ret, residual); ++} ++ + static DEFINE_IDR(proc_inum_idr); + static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ + +@@ -367,6 +401,20 @@ static struct dentry_operations proc_den + .d_delete = proc_delete_dentry, + }; + ++static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir, ++ struct dentry *d) ++{ ++ struct proc_dir_entry *de; ++ ++ for (de = dir->subdir; de; de = de->next) { ++ if (de->namelen != d->d_name.len) ++ continue; ++ if (!memcmp(d->d_name.name, de->name, de->namelen)) ++ break; ++ } ++ return de_get(de); ++} ++ + /* + * Don't create negative dentries here, return -ENOENT by hand + * instead. 
+@@ -374,34 +422,147 @@ static struct dentry_operations proc_den + struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) + { + struct inode *inode = NULL; +- struct proc_dir_entry * de; ++ struct proc_dir_entry *lde, *gde; + int error = -ENOENT; + + lock_kernel(); +- de = PDE(dir); +- if (de) { +- for (de = de->subdir; de ; de = de->next) { +- if (de->namelen != dentry->d_name.len) +- continue; +- if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { +- unsigned int ino = de->low_ino; ++ lde = LPDE(dir); + +- error = -EINVAL; +- inode = proc_get_inode(dir->i_sb, ino, de); +- break; +- } +- } +- } ++ if (!lde) ++ goto out; ++ ++ read_lock(&proc_tree_lock); ++ lde = __proc_lookup(lde, dentry); ++#ifdef CONFIG_VE ++ gde = GPDE(dir); ++ if (gde) ++ gde = __proc_lookup(gde, dentry); ++#else ++ gde = NULL; ++#endif ++ read_unlock(&proc_tree_lock); ++ ++ /* ++ * There are following possible cases after lookup: ++ * ++ * lde gde ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * NULL NULL ENOENT ++ * loc NULL found in local tree ++ * loc glob found in both trees ++ * NULL glob found in global tree ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * ++ * We initialized inode as follows after lookup: ++ * ++ * inode->lde inode->gde ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * loc NULL in local tree ++ * loc glob both trees ++ * glob glob global tree ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * i.e. inode->lde is always initialized ++ */ ++ ++ if (lde == NULL && gde == NULL) ++ goto out; ++ ++ if (lde != NULL) ++ inode = proc_get_inode(dir->i_sb, lde->low_ino, lde); ++ else ++ inode = proc_get_inode(dir->i_sb, gde->low_ino, gde); ++ ++ /* ++ * We can sleep in proc_get_inode(), but since we have i_sem ++ * being taken, no one can setup GPDE/LPDE on this inode. ++ */ ++ if (!inode) ++ goto out_put; ++ ++#ifdef CONFIG_VE ++ GPDE(inode) = de_get(gde); ++ if (gde) ++ __module_get(gde->owner); ++ ++ /* if dentry is found in both trees and it is a directory ++ * then inode's nlink count must be altered, because local ++ * and global subtrees may differ. ++ * on the other hand, they may intersect, so actual nlink ++ * value is difficult to calculate - upper estimate is used ++ * instead of it. ++ * dentry found in global tree only must not be writable ++ * in non-super ve. 
++ */ ++ if (lde && gde && lde != gde && gde->nlink > 1) ++ inode->i_nlink += gde->nlink - 2; ++ if (lde == NULL && !ve_is_super( ++ VE_OWNER_FSTYPE(dir->i_sb->s_type))) ++ inode->i_mode &= ~S_IWUGO; ++#endif + unlock_kernel(); ++ dentry->d_op = &proc_dentry_operations; ++ d_add(dentry, inode); ++ de_put(lde); ++ de_put(gde); ++ return NULL; + +- if (inode) { +- dentry->d_op = &proc_dentry_operations; +- d_add(dentry, inode); +- return NULL; +- } ++out_put: ++ de_put(lde); ++ de_put(gde); ++out: ++ unlock_kernel(); + return ERR_PTR(error); + } + ++struct proc_dir_reader { ++ struct list_head list; ++ struct proc_dir_entry *next; ++}; ++ ++static LIST_HEAD(proc_dir_readers); ++static DEFINE_SPINLOCK(proc_dir_readers_lock); ++ ++static inline void add_reader(struct proc_dir_reader *r, ++ struct proc_dir_entry *cur) ++{ ++ r->next = cur->next; ++ spin_lock(&proc_dir_readers_lock); ++ list_add(&r->list, &proc_dir_readers); ++ spin_unlock(&proc_dir_readers_lock); ++} ++ ++static inline struct proc_dir_entry *del_reader(struct proc_dir_reader *r) ++{ ++ spin_lock(&proc_dir_readers_lock); ++ list_del(&r->list); ++ spin_unlock(&proc_dir_readers_lock); ++ return r->next; ++} ++ ++static void notify_readers(struct proc_dir_entry *de) ++{ ++ struct proc_dir_reader *r; ++ ++ /* lockless since proc_tree_lock is taken for writing */ ++ list_for_each_entry(r, &proc_dir_readers, list) ++ if (r->next == de) ++ r->next = de->next; ++} ++ ++static inline int in_tree(struct proc_dir_entry *de, struct proc_dir_entry *dir) ++{ ++ struct proc_dir_entry *gde; ++ ++ for (gde = dir->subdir; gde; gde = gde->next) { ++ if (de->namelen != gde->namelen) ++ continue; ++ if (memcmp(de->name, gde->name, gde->namelen)) ++ continue; ++ return 1; ++ } ++ return 0; ++} ++ + /* + * This returns non-zero if at EOF, so that the /proc + * root directory can use this and check if it should +@@ -419,6 +580,7 @@ int proc_readdir(struct file * filp, + int i; + struct inode *inode = filp->f_dentry->d_inode; + int ret = 0; ++ struct proc_dir_reader this; + + lock_kernel(); + +@@ -445,13 +607,12 @@ int proc_readdir(struct file * filp, + filp->f_pos++; + /* fall through */ + default: ++ read_lock(&proc_tree_lock); + de = de->subdir; + i -= 2; + for (;;) { +- if (!de) { +- ret = 1; +- goto out; +- } ++ if (!de) ++ goto chk_global; + if (!i) + break; + de = de->next; +@@ -459,12 +620,60 @@ int proc_readdir(struct file * filp, + } + + do { +- if (filldir(dirent, de->name, de->namelen, filp->f_pos, +- de->low_ino, de->mode >> 12) < 0) ++ de_get(de); ++ add_reader(&this, de); ++ read_unlock(&proc_tree_lock); ++ ret = filldir(dirent, de->name, de->namelen, ++ filp->f_pos, de->low_ino, ++ de->mode >> 12); ++ read_lock(&proc_tree_lock); ++ de_put(de); ++ de = del_reader(&this); ++ if (ret < 0) { ++ read_unlock(&proc_tree_lock); ++ ret = 0; + goto out; ++ } + filp->f_pos++; +- de = de->next; + } while (de); ++chk_global: ++#ifdef CONFIG_VE ++ de = GPDE(inode); ++ if (de == NULL) ++ goto done; ++ ++ de = de->subdir; ++ while (de) { ++ if (in_tree(de, LPDE(inode))) { ++ de = de->next; ++ continue; ++ } ++ ++ if (i > 0) { ++ i--; ++ de = de->next; ++ continue; ++ } ++ ++ de_get(de); ++ add_reader(&this, de); ++ read_unlock(&proc_tree_lock); ++ ret = filldir(dirent, de->name, de->namelen, ++ filp->f_pos, de->low_ino, ++ de->mode >> 12); ++ read_lock(&proc_tree_lock); ++ de_put(de); ++ de = del_reader(&this); ++ if (ret < 0) { ++ read_unlock(&proc_tree_lock); ++ ret = 0; ++ goto out; ++ } ++ filp->f_pos++; ++ } ++done: ++#endif ++ 
read_unlock(&proc_tree_lock); + } + ret = 1; + out: unlock_kernel(); +@@ -486,8 +695,10 @@ static struct file_operations proc_dir_o + */ + static struct inode_operations proc_dir_inode_operations = { + .lookup = proc_lookup, ++#ifndef CONFIG_VE + .getattr = proc_getattr, + .setattr = proc_notify_change, ++#endif + }; + + static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +@@ -497,10 +708,20 @@ static int proc_register(struct proc_dir + i = get_inode_number(); + if (i == 0) + return -EAGAIN; ++ ++ write_lock(&proc_tree_lock); ++ if (dir->deleted) { ++ write_unlock(&proc_tree_lock); ++ release_inode_number(i); ++ return -ENOENT; ++ } ++ + dp->low_ino = i; + dp->next = dir->subdir; +- dp->parent = dir; ++ dp->parent = de_get(dir); + dir->subdir = dp; ++ write_unlock(&proc_tree_lock); ++ + if (S_ISDIR(dp->mode)) { + if (dp->proc_iops == NULL) { + dp->proc_fops = &proc_dir_operations; +@@ -554,24 +775,26 @@ static struct proc_dir_entry *proc_creat + mode_t mode, + nlink_t nlink) + { +- struct proc_dir_entry *ent = NULL; ++ struct proc_dir_entry *ent; + const char *fn = name; + int len; + + /* make sure name is valid */ +- if (!name || !strlen(name)) goto out; ++ if (!name || !strlen(name)) ++ goto out; + +- if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0) ++ if (xlate_proc_loc_name(name, parent, &fn) != 0) + goto out; + + /* At this point there must not be any '/' characters beyond *fn */ + if (strchr(fn, '/')) +- goto out; ++ goto out_put; + + len = strlen(fn); + + ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); +- if (!ent) goto out; ++ if (!ent) ++ goto out_put; + + memset(ent, 0, sizeof(struct proc_dir_entry)); + memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); +@@ -579,8 +802,13 @@ static struct proc_dir_entry *proc_creat + ent->namelen = len; + ent->mode = mode; + ent->nlink = nlink; +- out: ++ atomic_set(&ent->count, 1); + return ent; ++ ++out_put: ++ de_put(*parent); ++out: ++ return NULL; + } + + struct proc_dir_entry *proc_symlink(const char *name, +@@ -604,6 +832,7 @@ struct proc_dir_entry *proc_symlink(cons + kfree(ent); + ent = NULL; + } ++ de_put(parent); + } + return ent; + } +@@ -622,6 +851,7 @@ struct proc_dir_entry *proc_mkdir_mode(c + kfree(ent); + ent = NULL; + } ++ de_put(parent); + } + return ent; + } +@@ -660,9 +890,28 @@ struct proc_dir_entry *create_proc_entry + kfree(ent); + ent = NULL; + } ++ de_put(parent); + } + return ent; + } ++EXPORT_SYMBOL(remove_proc_glob_entry); ++ ++struct proc_dir_entry *create_proc_glob_entry(const char *name, mode_t mode, ++ struct proc_dir_entry *parent) ++{ ++ const char *path; ++ struct proc_dir_entry *ent; ++ ++ path = name; ++ if (xlate_proc_name(path, &parent, &name) != 0) ++ return NULL; ++ ++ ent = create_proc_entry(name, mode, parent); ++ de_put(parent); ++ return ent; ++} ++ ++EXPORT_SYMBOL(create_proc_glob_entry); + + void free_proc_entry(struct proc_dir_entry *de) + { +@@ -682,20 +931,21 @@ void free_proc_entry(struct proc_dir_ent + * Remove a /proc entry and free it if it's not currently in use. + * If it is in use, we set the 'deleted' flag. 
+ */ +-void remove_proc_entry(const char *name, struct proc_dir_entry *parent) ++static void __remove_proc_entry(const char *name, struct proc_dir_entry *parent) + { + struct proc_dir_entry **p; + struct proc_dir_entry *de; + const char *fn = name; + int len; + +- if (!parent && xlate_proc_name(name, &parent, &fn) != 0) +- goto out; + len = strlen(fn); ++ write_lock(&proc_tree_lock); + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (!proc_match(len, fn, *p)) + continue; ++ + de = *p; ++ notify_readers(de); + *p = de->next; + de->next = NULL; + if (S_ISDIR(de->mode)) +@@ -703,15 +953,43 @@ void remove_proc_entry(const char *name, + proc_kill_inodes(de); + de->nlink = 0; + WARN_ON(de->subdir); +- if (!atomic_read(&de->count)) +- free_proc_entry(de); +- else { +- de->deleted = 1; +- printk("remove_proc_entry: %s/%s busy, count=%d\n", +- parent->name, de->name, atomic_read(&de->count)); +- } ++ de->deleted = 1; ++ de_put(de); ++ de_put(parent); + break; + } +-out: +- return; ++ write_unlock(&proc_tree_lock); ++} ++ ++void remove_proc_loc_entry(const char *name, struct proc_dir_entry *parent) ++{ ++ const char *path; ++ ++ path = name; ++ if (xlate_proc_loc_name(path, &parent, &name) != 0) ++ return; ++ ++ __remove_proc_entry(name, parent); ++ de_put(parent); ++} ++ ++void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent) ++{ ++ const char *path; ++ ++ path = name; ++ if (xlate_proc_name(path, &parent, &name) != 0) ++ return; ++ ++ __remove_proc_entry(name, parent); ++ de_put(parent); ++} ++ ++void remove_proc_entry(const char *name, struct proc_dir_entry *parent) ++{ ++ remove_proc_loc_entry(name, parent); ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) ++ remove_proc_glob_entry(name, parent); ++#endif + } +diff -uprN linux-2.6.15.orig/fs/proc/inode.c linux-2.6.15-ve025stab014/fs/proc/inode.c +--- linux-2.6.15.orig/fs/proc/inode.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/proc/inode.c 2006-01-27 14:48:08.000000000 +0300 +@@ -8,6 +8,7 @@ + #include <linux/proc_fs.h> + #include <linux/kernel.h> + #include <linux/mm.h> ++#include <linux/ve_owner.h> + #include <linux/string.h> + #include <linux/stat.h> + #include <linux/file.h> +@@ -21,34 +22,25 @@ + + extern void free_proc_entry(struct proc_dir_entry *); + +-static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) +-{ +- if (de) +- atomic_inc(&de->count); +- return de; +-} +- + /* + * Decrements the use count and checks for deferred deletion. 
+ */ +-static void de_put(struct proc_dir_entry *de) ++void de_put(struct proc_dir_entry *de) + { + if (de) { +- lock_kernel(); + if (!atomic_read(&de->count)) { + printk("de_put: entry %s already free!\n", de->name); +- unlock_kernel(); + return; + } + + if (atomic_dec_and_test(&de->count)) { +- if (de->deleted) { +- printk("de_put: deferred delete of %s\n", ++ if (unlikely(!de->deleted)) { ++ printk("de_put: early delete of %s\n", + de->name); +- free_proc_entry(de); ++ return; + } ++ free_proc_entry(de); + } +- unlock_kernel(); + } + } + +@@ -68,12 +60,19 @@ static void proc_delete_inode(struct ino + put_task_struct(tsk); + + /* Let go of any associated proc directory entry */ +- de = PROC_I(inode)->pde; ++ de = LPDE(inode); + if (de) { + if (de->owner) + module_put(de->owner); + de_put(de); + } ++#ifdef CONFIG_VE ++ de = GPDE(inode); ++ if (de) { ++ module_put(de->owner); ++ de_put(de); ++ } ++#endif + clear_inode(inode); + } + +@@ -100,6 +99,9 @@ static struct inode *proc_alloc_inode(st + ei->pde = NULL; + inode = &ei->vfs_inode; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++#ifdef CONFIG_VE ++ GPDE(inode) = NULL; ++#endif + return inode; + } + +@@ -213,6 +215,12 @@ int proc_fill_super(struct super_block * + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) + goto out_no_root; ++#ifdef CONFIG_VE ++ LPDE(root_inode) = de_get(get_exec_env()->proc_root); ++ GPDE(root_inode) = &proc_root; ++#else ++ LPDE(root_inode) = &proc_root; ++#endif + return 0; + + out_no_root: +diff -uprN linux-2.6.15.orig/fs/proc/proc_misc.c linux-2.6.15-ve025stab014/fs/proc/proc_misc.c +--- linux-2.6.15.orig/fs/proc/proc_misc.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/proc/proc_misc.c 2006-01-27 14:48:08.000000000 +0300 +@@ -31,6 +31,7 @@ + #include <linux/pagemap.h> + #include <linux/swap.h> + #include <linux/slab.h> ++#include <linux/virtinfo.h> + #include <linux/smp.h> + #include <linux/signal.h> + #include <linux/module.h> +@@ -52,8 +53,10 @@ + #include <asm/div64.h> + #include "internal.h" + +-#define LOAD_INT(x) ((x) >> FSHIFT) +-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) ++#ifdef CONFIG_FAIRSCHED ++#include <linux/fairsched.h> ++#endif ++ + /* + * Warning: stuff below (imported functions) assumes that its output will fit + * into one page. For some of those functions it may be wrong. 
Moreover, we +@@ -84,15 +87,33 @@ static int loadavg_read_proc(char *page, + { + int a, b, c; + int len; +- +- a = avenrun[0] + (FIXED_1/200); +- b = avenrun[1] + (FIXED_1/200); +- c = avenrun[2] + (FIXED_1/200); ++ unsigned long __nr_running; ++ int __nr_threads; ++ unsigned long *__avenrun; ++ struct ve_struct *ve; ++ ++ ve = get_exec_env(); ++ ++ if (ve_is_super(ve)) { ++ __avenrun = &avenrun[0]; ++ __nr_running = nr_running(); ++ __nr_threads = nr_threads; ++ } ++#ifdef CONFIG_VE ++ else { ++ __avenrun = &ve->avenrun[0]; ++ __nr_running = nr_running_ve(ve); ++ __nr_threads = atomic_read(&ve->pcounter); ++ } ++#endif ++ a = __avenrun[0] + (FIXED_1/200); ++ b = __avenrun[1] + (FIXED_1/200); ++ c = __avenrun[2] + (FIXED_1/200); + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), +- nr_running(), nr_threads, last_pid); ++ __nr_running, __nr_threads, last_pid); + return proc_calc_metrics(page, start, off, count, eof, len); + } + +@@ -105,6 +126,13 @@ static int uptime_read_proc(char *page, + cputime_t idletime = cputime_add(init_task.utime, init_task.stime); + + do_posix_clock_monotonic_gettime(&uptime); ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) { ++ set_normalized_timespec(&uptime, ++ uptime.tv_sec - get_exec_env()->start_timespec.tv_sec, ++ uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec); ++ } ++#endif + cputime_to_timespec(idletime, &idle); + len = sprintf(page,"%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, +@@ -118,35 +146,37 @@ static int uptime_read_proc(char *page, + static int meminfo_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) + { +- struct sysinfo i; ++ struct meminfo mi; + int len; +- struct page_state ps; +- unsigned long inactive; +- unsigned long active; +- unsigned long free; +- unsigned long committed; +- unsigned long allowed; ++ unsigned long dummy; + struct vmalloc_info vmi; +- long cached; + +- get_page_state(&ps); +- get_zone_counts(&active, &inactive, &free); ++ get_page_state(&mi.ps); ++ get_zone_counts(&mi.active, &mi.inactive, &dummy); + + /* + * display in kilobytes. + */ + #define K(x) ((x) << (PAGE_SHIFT - 10)) +- si_meminfo(&i); +- si_swapinfo(&i); +- committed = atomic_read(&vm_committed_space); +- allowed = ((totalram_pages - hugetlb_total_pages()) +- * sysctl_overcommit_ratio / 100) + total_swap_pages; ++ si_meminfo(&mi.si); ++ si_swapinfo(&mi.si); ++ mi.committed_space = atomic_read(&vm_committed_space); ++ mi.swapcache = total_swapcache_pages; ++ mi.cache = get_page_cache_size() - mi.swapcache - mi.si.bufferram; ++ if (mi.cache < 0) ++ mi.cache = 0; + +- cached = get_page_cache_size() - total_swapcache_pages - i.bufferram; +- if (cached < 0) +- cached = 0; ++ mi.vmalloc_total = (VMALLOC_END - VMALLOC_START) >> PAGE_SHIFT; ++ mi.allowed = ((totalram_pages - hugetlb_total_pages()) ++ * sysctl_overcommit_ratio / 100) + total_swap_pages; + + get_vmalloc_info(&vmi); ++ mi.vmalloc_used = vmi.used >> PAGE_SHIFT; ++ mi.vmalloc_largest = vmi.largest_chunk >> PAGE_SHIFT; ++ ++ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) ++ & NOTIFY_FAIL) ++ return -ENOMSG; + + /* + * Tagged format, for easy grepping and expansion. 
+@@ -175,29 +205,29 @@ static int meminfo_read_proc(char *page, + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", +- K(i.totalram), +- K(i.freeram), +- K(i.bufferram), +- K(cached), +- K(total_swapcache_pages), +- K(active), +- K(inactive), +- K(i.totalhigh), +- K(i.freehigh), +- K(i.totalram-i.totalhigh), +- K(i.freeram-i.freehigh), +- K(i.totalswap), +- K(i.freeswap), +- K(ps.nr_dirty), +- K(ps.nr_writeback), +- K(ps.nr_mapped), +- K(ps.nr_slab), +- K(allowed), +- K(committed), +- K(ps.nr_page_table_pages), +- (unsigned long)VMALLOC_TOTAL >> 10, +- vmi.used >> 10, +- vmi.largest_chunk >> 10 ++ K(mi.si.totalram), ++ K(mi.si.freeram), ++ K(mi.si.bufferram), ++ K(mi.cache), ++ K(mi.swapcache), ++ K(mi.active), ++ K(mi.inactive), ++ K(mi.si.totalhigh), ++ K(mi.si.freehigh), ++ K(mi.si.totalram-mi.si.totalhigh), ++ K(mi.si.freeram-mi.si.freehigh), ++ K(mi.si.totalswap), ++ K(mi.si.freeswap), ++ K(mi.ps.nr_dirty), ++ K(mi.ps.nr_writeback), ++ K(mi.ps.nr_mapped), ++ K(mi.ps.nr_slab), ++ K(mi.allowed), ++ K(mi.committed_space), ++ K(mi.ps.nr_page_table_pages), ++ K(mi.vmalloc_total), ++ K(mi.vmalloc_used), ++ K(mi.vmalloc_largest) + ); + + len += hugetlb_report_meminfo(page + len); +@@ -337,18 +367,15 @@ static struct file_operations proc_slabi + .release = seq_release, + }; + +-static int show_stat(struct seq_file *p, void *v) ++static void show_stat_ve0(struct seq_file *p) + { + int i; +- unsigned long jif; ++ struct page_state page_state; + cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + u64 sum = 0; + + user = nice = system = idle = iowait = + irq = softirq = steal = cputime64_zero; +- jif = - wall_to_monotonic.tv_sec; +- if (wall_to_monotonic.tv_nsec) +- --jif; + + for_each_cpu(i) { + int j; +@@ -402,9 +429,84 @@ static int show_stat(struct seq_file *p, + for (i = 0; i < NR_IRQS; i++) + seq_printf(p, " %u", kstat_irqs(i)); + #endif ++ get_full_page_state(&page_state); ++ seq_printf(p, "\nswap %lu %lu\n", page_state.pswpin, page_state.pswpout); ++} ++ ++#ifdef CONFIG_VE ++static void show_stat_ve(struct seq_file *p, struct ve_struct *env) ++{ ++ int i; ++ u64 user, nice, system; ++ cycles_t idle, iowait; ++ cpumask_t ve_cpus; ++ ++ ve_cpu_online_map(env, &ve_cpus); ++ ++ user = nice = system = idle = iowait = 0; ++ for_each_cpu_mask(i, ve_cpus) { ++ user += VE_CPU_STATS(env, i)->user; ++ nice += VE_CPU_STATS(env, i)->nice; ++ system += VE_CPU_STATS(env, i)->system; ++ idle += ve_sched_get_idle_time(env, i); ++ iowait += ve_sched_get_iowait_time(env, i); ++ } ++ ++ seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n", ++ (unsigned long long)cputime64_to_clock_t(user), ++ (unsigned long long)cputime64_to_clock_t(nice), ++ (unsigned long long)cputime64_to_clock_t(system), ++ (unsigned long long)cycles_to_clocks(idle), ++ (unsigned long long)cycles_to_clocks(iowait)); ++ ++ for_each_cpu_mask(i, ve_cpus) { ++ user = VE_CPU_STATS(env, i)->user; ++ nice = VE_CPU_STATS(env, i)->nice; ++ system = VE_CPU_STATS(env, i)->system; ++ idle = ve_sched_get_idle_time(env, i); ++ iowait = ve_sched_get_iowait_time(env, i); ++ seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n", ++ i, ++ (unsigned long long)cputime64_to_clock_t(user), ++ (unsigned long long)cputime64_to_clock_t(nice), ++ (unsigned long long)cputime64_to_clock_t(system), ++ (unsigned long long)cycles_to_clocks(idle), ++ (unsigned long long)cycles_to_clocks(iowait)); ++ } ++ seq_printf(p, "intr 0\nswap 0 0\n"); ++} ++#endif ++ ++int show_stat(struct seq_file *p, void *v) ++{ ++ extern 
unsigned long total_forks; ++ unsigned long seq, jif; ++ struct ve_struct *env; ++ unsigned long __nr_running, __nr_iowait; ++ ++ do { ++ seq = read_seqbegin(&xtime_lock); ++ jif = - wall_to_monotonic.tv_sec; ++ if (wall_to_monotonic.tv_nsec) ++ --jif; ++ } while (read_seqretry(&xtime_lock, seq)); ++ ++ env = get_exec_env(); ++ if (ve_is_super(env)) { ++ show_stat_ve0(p); ++ __nr_running = nr_running(); ++ __nr_iowait = nr_iowait(); ++ } ++#ifdef CONFIG_VE ++ else { ++ show_stat_ve(p, env); ++ __nr_running = nr_running_ve(env); ++ __nr_iowait = nr_iowait_ve(env); ++ } ++#endif + + seq_printf(p, +- "\nctxt %llu\n" ++ "ctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" +@@ -412,8 +514,8 @@ static int show_stat(struct seq_file *p, + nr_context_switches(), + (unsigned long)jif, + total_forks, +- nr_running(), +- nr_iowait()); ++ __nr_running, ++ __nr_iowait); + + return 0; + } +@@ -510,7 +612,8 @@ static int cmdline_read_proc(char *page, + { + int len; + +- len = sprintf(page, "%s\n", saved_command_line); ++ len = sprintf(page, "%s\n", ++ ve_is_super(get_exec_env()) ? saved_command_line : ""); + return proc_calc_metrics(page, start, off, count, eof, len); + } + +diff -uprN linux-2.6.15.orig/fs/proc/proc_tty.c linux-2.6.15-ve025stab014/fs/proc/proc_tty.c +--- linux-2.6.15.orig/fs/proc/proc_tty.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/proc/proc_tty.c 2006-01-27 14:48:08.000000000 +0300 +@@ -6,6 +6,7 @@ + + #include <asm/uaccess.h> + ++#include <linux/ve_owner.h> + #include <linux/init.h> + #include <linux/errno.h> + #include <linux/time.h> +@@ -106,24 +107,35 @@ static int show_tty_driver(struct seq_fi + /* iterator */ + static void *t_start(struct seq_file *m, loff_t *pos) + { +- struct list_head *p; ++ struct tty_driver *drv; ++ + loff_t l = *pos; +- list_for_each(p, &tty_drivers) ++ read_lock(&tty_driver_guard); ++ list_for_each_entry(drv, &tty_drivers, tty_drivers) { ++ if (!ve_accessible_strict(VE_OWNER_TTYDRV(drv), get_exec_env())) ++ continue; + if (!l--) +- return list_entry(p, struct tty_driver, tty_drivers); ++ return drv; ++ } + return NULL; + } + + static void *t_next(struct seq_file *m, void *v, loff_t *pos) + { +- struct list_head *p = ((struct tty_driver *)v)->tty_drivers.next; ++ struct tty_driver *drv; ++ + (*pos)++; +- return p==&tty_drivers ? 
NULL : +- list_entry(p, struct tty_driver, tty_drivers); ++ drv = (struct tty_driver *)v; ++ list_for_each_entry_continue(drv, &tty_drivers, tty_drivers) { ++ if (ve_accessible_strict(VE_OWNER_TTYDRV(drv), get_exec_env())) ++ return drv; ++ } ++ return NULL; + } + + static void t_stop(struct seq_file *m, void *v) + { ++ read_unlock(&tty_driver_guard); + } + + static struct seq_operations tty_drivers_op = { +diff -uprN linux-2.6.15.orig/fs/proc/root.c linux-2.6.15-ve025stab014/fs/proc/root.c +--- linux-2.6.15.orig/fs/proc/root.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/proc/root.c 2006-01-27 14:48:08.000000000 +0300 +@@ -18,7 +18,10 @@ + #include <linux/bitops.h> + #include <linux/smp_lock.h> + +-struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; ++#ifndef CONFIG_VE ++struct proc_dir_entry *proc_net, *proc_net_stat; ++#endif ++struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver; + + #ifdef CONFIG_SYSCTL + struct proc_dir_entry *proc_sys_root; +@@ -36,6 +39,8 @@ static struct file_system_type proc_fs_t + .kill_sb = kill_anon_super, + }; + ++EXPORT_SYMBOL(proc_fs_type); ++ + extern int __init proc_init_inodecache(void); + void __init proc_root_init(void) + { +@@ -155,7 +160,9 @@ EXPORT_SYMBOL(create_proc_entry); + EXPORT_SYMBOL(remove_proc_entry); + EXPORT_SYMBOL(proc_root); + EXPORT_SYMBOL(proc_root_fs); ++#ifndef CONFIG_VE + EXPORT_SYMBOL(proc_net); + EXPORT_SYMBOL(proc_net_stat); ++#endif + EXPORT_SYMBOL(proc_bus); + EXPORT_SYMBOL(proc_root_driver); +diff -uprN linux-2.6.15.orig/fs/proc/task_mmu.c linux-2.6.15-ve025stab014/fs/proc/task_mmu.c +--- linux-2.6.15.orig/fs/proc/task_mmu.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/proc/task_mmu.c 2006-01-27 14:48:08.000000000 +0300 +@@ -90,9 +90,12 @@ int proc_exe_link(struct inode *inode, s + } + + if (vma) { +- *mnt = mntget(vma->vm_file->f_vfsmnt); +- *dentry = dget(vma->vm_file->f_dentry); +- result = 0; ++ result = d_root_check(vma->vm_file->f_dentry, ++ vma->vm_file->f_vfsmnt); ++ if (!result) { ++ *mnt = mntget(vma->vm_file->f_vfsmnt); ++ *dentry = dget(vma->vm_file->f_dentry); ++ } + } + + up_read(&mm->mmap_sem); +diff -uprN linux-2.6.15.orig/fs/proc/task_nommu.c linux-2.6.15-ve025stab014/fs/proc/task_nommu.c +--- linux-2.6.15.orig/fs/proc/task_nommu.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/proc/task_nommu.c 2006-01-27 14:48:08.000000000 +0300 +@@ -126,9 +126,12 @@ int proc_exe_link(struct inode *inode, s + } + + if (vma) { +- *mnt = mntget(vma->vm_file->f_vfsmnt); +- *dentry = dget(vma->vm_file->f_dentry); +- result = 0; ++ result = d_root_check(vma->vm_file->f_dentry, ++ vma->vm_file->f_vfsmnt); ++ if (!result) { ++ *mnt = mntget(vma->vm_file->f_vfsmnt); ++ *dentry = dget(vma->vm_file->f_dentry); ++ } + } + + up_read(&mm->mmap_sem); +diff -uprN linux-2.6.15.orig/fs/quota.c linux-2.6.15-ve025stab014/fs/quota.c +--- linux-2.6.15.orig/fs/quota.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/quota.c 2006-01-27 14:48:08.000000000 +0300 +@@ -80,11 +80,11 @@ static int generic_quotactl_valid(struct + if (cmd == Q_GETQUOTA) { + if (((type == USRQUOTA && current->euid != id) || + (type == GRPQUOTA && !in_egroup_p(id))) && +- !capable(CAP_SYS_ADMIN)) ++ !capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + } + else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + return 0; +@@ -131,10 +131,10 
@@ static int xqm_quotactl_valid(struct sup + if (cmd == Q_XGETQUOTA) { + if (((type == XQM_USRQUOTA && current->euid != id) || + (type == XQM_GRPQUOTA && !in_egroup_p(id))) && +- !capable(CAP_SYS_ADMIN)) ++ !capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + } + +@@ -215,7 +215,7 @@ restart: + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); +- if (sb->s_root && sb->s_qcop->quota_sync) ++ if (sb->s_root && sb->s_qcop && sb->s_qcop->quota_sync) + quota_sync_sb(sb, type); + up_read(&sb->s_umount); + spin_lock(&sb_lock); +@@ -357,7 +357,7 @@ asmlinkage long sys_quotactl(unsigned in + tmp = getname(special); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); +- bdev = lookup_bdev(tmp); ++ bdev = lookup_bdev(tmp, FMODE_QUOTACTL); + putname(tmp); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); +diff -uprN linux-2.6.15.orig/fs/reiserfs/namei.c linux-2.6.15-ve025stab014/fs/reiserfs/namei.c +--- linux-2.6.15.orig/fs/reiserfs/namei.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/reiserfs/namei.c 2006-01-27 14:48:08.000000000 +0300 +@@ -868,6 +868,9 @@ static int reiserfs_rmdir(struct inode * + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + ++ inode = dentry->d_inode; ++ DQUOT_INIT(inode); ++ + /* we will be doing 2 balancings and update 2 stat data, we change quotas + * of the owner of the directory and of the owner of the parent directory. + * The quota structure is possibly deleted only on last iput => outside +@@ -892,8 +895,6 @@ static int reiserfs_rmdir(struct inode * + goto end_rmdir; + } + +- inode = dentry->d_inode; +- + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + +@@ -956,6 +957,7 @@ static int reiserfs_unlink(struct inode + unsigned long savelink; + + inode = dentry->d_inode; ++ DQUOT_INIT(inode); + + /* in this transaction we can be doing at max two balancings and update + * two stat datas, we change quotas of the owner of the directory and of +@@ -1263,6 +1265,8 @@ static int reiserfs_rename(struct inode + + old_inode = old_dentry->d_inode; + new_dentry_inode = new_dentry->d_inode; ++ if (new_dentry_inode) ++ DQUOT_INIT(new_dentry_inode); + + // make sure, that oldname still exists and points to an object we + // are going to rename +diff -uprN linux-2.6.15.orig/fs/select.c linux-2.6.15-ve025stab014/fs/select.c +--- linux-2.6.15.orig/fs/select.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/select.c 2006-01-27 14:48:05.000000000 +0300 +@@ -24,6 +24,8 @@ + #include <linux/fs.h> + #include <linux/rcupdate.h> + ++#include <ub/ub_mem.h> ++ + #include <asm/uaccess.h> + + #define ROUND_UP(x,y) (((x)+(y)-1)/(y)) +@@ -276,7 +278,7 @@ int do_select(int n, fd_set_bits *fds, l + + static void *select_bits_alloc(int size) + { +- return kmalloc(6 * size, GFP_KERNEL); ++ return ub_kmalloc(6 * size, GFP_KERNEL); + } + + static void select_bits_free(void *bits, int size) +@@ -498,7 +500,7 @@ asmlinkage long sys_poll(struct pollfd _ + err = -ENOMEM; + while(i!=0) { + struct poll_list *pp; +- pp = kmalloc(sizeof(struct poll_list)+ ++ pp = ub_kmalloc(sizeof(struct poll_list)+ + sizeof(struct pollfd)* + (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i), + GFP_KERNEL); +diff -uprN linux-2.6.15.orig/fs/seq_file.c linux-2.6.15-ve025stab014/fs/seq_file.c +--- linux-2.6.15.orig/fs/seq_file.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/seq_file.c 2006-01-27 
14:48:08.000000000 +0300 +@@ -345,6 +345,8 @@ int seq_path(struct seq_file *m, + if (m->count < m->size) { + char *s = m->buf + m->count; + char *p = d_path(dentry, mnt, s, m->size - m->count); ++ if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG) ++ return 0; + if (!IS_ERR(p)) { + while (s <= p) { + char c = *p++; +diff -uprN linux-2.6.15.orig/fs/simfs.c linux-2.6.15-ve025stab014/fs/simfs.c +--- linux-2.6.15.orig/fs/simfs.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/simfs.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,319 @@ ++/* ++ * fs/simfs.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/fs.h> ++#include <linux/file.h> ++#include <linux/init.h> ++#include <linux/namei.h> ++#include <linux/err.h> ++#include <linux/module.h> ++#include <linux/mount.h> ++#include <linux/vzquota.h> ++#include <linux/statfs.h> ++#include <linux/virtinfo.h> ++#include <linux/faudit.h> ++#include <linux/genhd.h> ++ ++#include <asm/unistd.h> ++#include <asm/uaccess.h> ++ ++#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb ++ ++static struct super_operations sim_super_ops; ++ ++static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ struct super_block *sb; ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ if (!inode->i_op->getattr) { ++ generic_fillattr(inode, stat); ++ if (!stat->blksize) { ++ unsigned blocks; ++ ++ sb = inode->i_sb; ++ blocks = (stat->size + sb->s_blocksize-1) >> ++ sb->s_blocksize_bits; ++ stat->blocks = (sb->s_blocksize / 512) * blocks; ++ stat->blksize = sb->s_blocksize; ++ } ++ } else { ++ int err; ++ ++ err = inode->i_op->getattr(mnt, dentry, stat); ++ if (err) ++ return err; ++ } ++ ++ sb = mnt->mnt_sb; ++ if (sb->s_op == &sim_super_ops) ++ stat->dev = sb->s_dev; ++ return 0; ++} ++ ++static void quota_get_stat(struct super_block *sb, struct kstatfs *buf) ++{ ++ int err; ++ struct dq_stat qstat; ++ struct virt_info_quota q; ++ long free_file, adj_file; ++ s64 blk, free_blk, adj_blk; ++ int bsize_bits; ++ ++ q.super = sb; ++ q.qstat = &qstat; ++ err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q); ++ if (err != NOTIFY_OK) ++ return; ++ ++ bsize_bits = ffs(buf->f_bsize) - 1; ++ free_blk = (s64)(qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits; ++ if (free_blk < 0) ++ free_blk = 0; ++ /* ++ * In the regular case, we always set buf->f_bfree and buf->f_blocks to ++ * the values reported by quota. In case of real disk space shortage, ++ * we adjust the values. We want this adjustment to look as if the ++ * total disk space were reduced, not as if the usage were increased. ++ * -- SAW ++ */ ++ adj_blk = 0; ++ if (buf->f_bfree < free_blk) ++ adj_blk = free_blk - buf->f_bfree; ++ buf->f_bfree = (long)(free_blk - adj_blk); ++ ++ if (free_blk < buf->f_bavail) ++ buf->f_bavail = (long)free_blk; /* min(f_bavail, free_blk) */ ++ ++ blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk; ++ buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk; ++ ++ free_file = qstat.isoftlimit - qstat.icurrent; ++ if (free_file < 0) ++ free_file = 0; ++ if (buf->f_ffree == -1) ++ /* ++ * One filesystem uses -1 to represent the fact that it doesn't ++ * have a detached limit for inode number. ++ * May be, because -1 is a good pretendent for the maximum value ++ * of signed long type, may be, because it's just nice to have ++ * an exceptional case... 
Guess what that filesystem is :-) ++ * -- SAW ++ */ ++ buf->f_ffree = free_file; ++ adj_file = 0; ++ if (buf->f_ffree < free_file) ++ adj_file = free_file - buf->f_ffree; ++ buf->f_ffree = free_file - adj_file; ++ buf->f_files = qstat.isoftlimit - adj_file; ++} ++ ++static int sim_statfs(struct super_block *sb, struct statfs *buf) ++{ ++ int err; ++ struct super_block *lsb; ++ struct kstatfs statbuf; ++ ++ err = 0; ++ if (sb->s_op != &sim_super_ops) ++ goto out; ++ ++ lsb = SIMFS_GET_LOWER_FS_SB(sb); ++ ++ err = -ENOSYS; ++ if (lsb && lsb->s_op && lsb->s_op->statfs) ++ err = lsb->s_op->statfs(lsb, &statbuf); ++ if (err) ++ goto out; ++ ++ quota_get_stat(sb, &statbuf); ++ ++ buf->f_files = statbuf.f_files; ++ buf->f_ffree = statbuf.f_ffree; ++ buf->f_blocks = statbuf.f_blocks; ++ buf->f_bfree = statbuf.f_bfree; ++ buf->f_bavail = statbuf.f_bavail; ++out: ++ return err; ++} ++ ++static int sim_statfs64(struct super_block *sb, struct statfs64 *buf) ++{ ++ int err; ++ struct super_block *lsb; ++ struct kstatfs statbuf; ++ ++ err = 0; ++ if (sb->s_op != &sim_super_ops) ++ goto out; ++ ++ lsb = SIMFS_GET_LOWER_FS_SB(sb); ++ ++ err = -ENOSYS; ++ if (lsb && lsb->s_op && lsb->s_op->statfs) ++ err = lsb->s_op->statfs(lsb, &statbuf); ++ if (err) ++ goto out; ++ ++ quota_get_stat(sb, &statbuf); ++ ++ buf->f_files = (__u64)statbuf.f_files; ++ buf->f_ffree = (__u64)statbuf.f_ffree; ++ buf->f_blocks = (__u64)statbuf.f_blocks; ++ buf->f_bfree = (__u64)statbuf.f_bfree; ++ buf->f_bavail = (__u64)statbuf.f_bavail; ++out: ++ return err; ++} ++ ++static int sim_systemcall(struct vnotifier_block *me, unsigned long n, ++ void *d, int old_ret) ++{ ++ int err; ++ struct faudit_stat_arg *arg; ++ ++ arg = (struct faudit_stat_arg *)d; ++ switch (n) { ++ case VIRTINFO_FAUDIT_STAT: ++ err = sim_getattr(arg->mnt, arg->dentry, ++ (struct kstat *)arg->stat); ++ break; ++ case VIRTINFO_FAUDIT_STATFS: ++ err = sim_statfs(arg->mnt->mnt_sb, ++ (struct statfs *)arg->stat); ++ break; ++ case VIRTINFO_FAUDIT_STATFS64: ++ err = sim_statfs64(arg->mnt->mnt_sb, ++ (struct statfs64 *)arg->stat); ++ break; ++ default: ++ return old_ret; ++ } ++ arg->err = err; ++ return (err ? 
NOTIFY_BAD : NOTIFY_OK); ++} ++ ++static struct inode *sim_quota_root(struct super_block *sb) ++{ ++ return sb->s_root->d_inode; ++} ++ ++void sim_put_super(struct super_block *sb) ++{ ++ struct virt_info_quota viq; ++ ++ viq.super = sb; ++ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq); ++ bdput(sb->s_bdev); ++} ++ ++static struct super_operations sim_super_ops = { ++ .get_quota_root = sim_quota_root, ++ .put_super = sim_put_super, ++}; ++ ++static int sim_fill_super(struct super_block *s, void *data) ++{ ++ int err; ++ struct nameidata *nd; ++ ++ err = set_anon_super(s, NULL); ++ if (err) ++ goto out; ++ ++ err = 0; ++ nd = (struct nameidata *)data; ++ s->s_root = dget(nd->dentry); ++ s->s_op = &sim_super_ops; ++out: ++ return err; ++} ++ ++struct super_block *sim_get_sb(struct file_system_type *type, ++ int flags, const char *dev_name, void *opt) ++{ ++ int err; ++ struct nameidata nd; ++ struct super_block *sb; ++ struct block_device *bd; ++ struct virt_info_quota viq; ++ static struct hd_struct fake_hds; ++ ++ sb = ERR_PTR(-EINVAL); ++ if (opt == NULL) ++ goto out; ++ ++ err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ sb = ERR_PTR(err); ++ if (err) ++ goto out; ++ ++ sb = sget(type, NULL, sim_fill_super, &nd); ++ if (IS_ERR(sb)) ++ goto out_path; ++ ++ bd = bdget(sb->s_dev); ++ if (!bd) ++ goto out_killsb; ++ ++ sb->s_bdev = bd; ++ bd->bd_part = &fake_hds; ++ viq.super = sb; ++ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq); ++out_path: ++ path_release(&nd); ++out: ++ return sb; ++ ++out_killsb: ++ up_write(&sb->s_umount); ++ deactivate_super(sb); ++ sb = ERR_PTR(-ENODEV); ++ goto out_path; ++} ++ ++static struct file_system_type sim_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "simfs", ++ .get_sb = sim_get_sb, ++ .kill_sb = kill_anon_super, ++}; ++ ++static struct vnotifier_block sim_syscalls = { ++ .notifier_call = sim_systemcall, ++}; ++ ++static int __init init_simfs(void) ++{ ++ int err; ++ ++ err = register_filesystem(&sim_fs_type); ++ if (err) ++ return err; ++ ++ virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls); ++ return 0; ++} ++ ++static void __exit exit_simfs(void) ++{ ++ virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls); ++ unregister_filesystem(&sim_fs_type); ++} ++ ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(init_simfs); ++module_exit(exit_simfs); +diff -uprN linux-2.6.15.orig/fs/stat.c linux-2.6.15-ve025stab014/fs/stat.c +--- linux-2.6.15.orig/fs/stat.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/stat.c 2006-01-27 14:48:07.000000000 +0300 +@@ -15,6 +15,7 @@ + #include <linux/namei.h> + #include <linux/security.h> + #include <linux/syscalls.h> ++#include <linux/faudit.h> + + #include <asm/uaccess.h> + #include <asm/unistd.h> +@@ -42,11 +43,19 @@ int vfs_getattr(struct vfsmount *mnt, st + { + struct inode *inode = dentry->d_inode; + int retval; ++ struct faudit_stat_arg arg; + + retval = security_inode_getattr(mnt, dentry); + if (retval) + return retval; + ++ arg.mnt = mnt; ++ arg.dentry = dentry; ++ arg.stat = stat; ++ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg) ++ != NOTIFY_DONE) ++ return arg.err; ++ + if (inode->i_op->getattr) + return inode->i_op->getattr(mnt, dentry, stat); + +diff -uprN linux-2.6.15.orig/fs/super.c linux-2.6.15-ve025stab014/fs/super.c +--- linux-2.6.15.orig/fs/super.c 2006-01-03 06:21:10.000000000 +0300 ++++ 
linux-2.6.15-ve025stab014/fs/super.c 2006-01-27 14:48:08.000000000 +0300 +@@ -23,6 +23,7 @@ + #include <linux/config.h> + #include <linux/module.h> + #include <linux/slab.h> ++#include <linux/ve_owner.h> + #include <linux/init.h> + #include <linux/smp_lock.h> + #include <linux/acct.h> +@@ -69,6 +70,7 @@ static struct super_block *alloc_super(v + INIT_LIST_HEAD(&s->s_io); + INIT_LIST_HEAD(&s->s_files); + INIT_LIST_HEAD(&s->s_instances); ++ INIT_LIST_HEAD(&s->s_dshrinkers); + INIT_HLIST_HEAD(&s->s_anon); + INIT_LIST_HEAD(&s->s_inodes); + init_rwsem(&s->s_umount); +@@ -231,8 +233,9 @@ void generic_shutdown_super(struct super + if (root) { + sb->s_root = NULL; + shrink_dcache_parent(root); +- shrink_dcache_anon(&sb->s_anon); ++ shrink_dcache_anon(sb); + dput(root); ++ dcache_shrinker_wait_sb(sb); + fsync_super(sb); + lock_super(sb); + sb->s_flags &= ~MS_ACTIVE; +@@ -480,11 +483,20 @@ asmlinkage long sys_ustat(unsigned dev, + struct super_block *s; + struct ustat tmp; + struct kstatfs sbuf; +- int err = -EINVAL; ++ dev_t kdev; ++ int err; ++ ++ kdev = new_decode_dev(dev); ++#ifdef CONFIG_VE ++ err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ); ++ if (err) ++ goto out; ++#endif + +- s = user_get_super(new_decode_dev(dev)); +- if (s == NULL) +- goto out; ++ err = -EINVAL; ++ s = user_get_super(kdev); ++ if (s == NULL) ++ goto out; + err = vfs_statfs(s, &sbuf); + drop_super(s); + if (err) +@@ -598,6 +610,13 @@ void emergency_remount(void) + static struct idr unnamed_dev_idr; + static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ + ++/* for compatibility with coreutils still unaware of new minor sizes */ ++int unnamed_dev_majors[] = { ++ 0, 144, 145, 146, 242, 243, 244, 245, ++ 246, 247, 248, 249, 250, 251, 252, 253 ++}; ++EXPORT_SYMBOL(unnamed_dev_majors); ++ + int set_anon_super(struct super_block *s, void *data) + { + int dev; +@@ -615,13 +634,13 @@ int set_anon_super(struct super_block *s + else if (error) + return -EAGAIN; + +- if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { ++ if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) { + spin_lock(&unnamed_dev_lock); + idr_remove(&unnamed_dev_idr, dev); + spin_unlock(&unnamed_dev_lock); + return -EMFILE; + } +- s->s_dev = MKDEV(0, dev & MINORMASK); ++ s->s_dev = make_unnamed_dev(dev); + return 0; + } + +@@ -629,8 +648,9 @@ EXPORT_SYMBOL(set_anon_super); + + void kill_anon_super(struct super_block *sb) + { +- int slot = MINOR(sb->s_dev); ++ int slot; + ++ slot = unnamed_dev_idx(sb->s_dev); + generic_shutdown_super(sb); + spin_lock(&unnamed_dev_lock); + idr_remove(&unnamed_dev_idr, slot); +diff -uprN linux-2.6.15.orig/fs/sysfs/bin.c linux-2.6.15-ve025stab014/fs/sysfs/bin.c +--- linux-2.6.15.orig/fs/sysfs/bin.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/sysfs/bin.c 2006-01-27 14:48:08.000000000 +0300 +@@ -120,6 +120,9 @@ static int open(struct inode * inode, st + struct bin_attribute * attr = to_bin_attr(file->f_dentry); + int error = -EINVAL; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + if (!kobj || !attr) + goto Done; + +@@ -196,6 +199,9 @@ int sysfs_create_bin_file(struct kobject + + int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + sysfs_hash_and_remove(kobj->dentry,attr->attr.name); + return 0; + } +diff -uprN linux-2.6.15.orig/fs/sysfs/dir.c linux-2.6.15-ve025stab014/fs/sysfs/dir.c +--- linux-2.6.15.orig/fs/sysfs/dir.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/sysfs/dir.c 2006-01-27 14:48:08.000000000 
+0300 +@@ -140,6 +140,9 @@ int sysfs_create_dir(struct kobject * ko + struct dentry * parent; + int error = 0; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + BUG_ON(!kobj); + + if (kobj->parent) +@@ -274,10 +277,14 @@ void sysfs_remove_subdir(struct dentry * + + void sysfs_remove_dir(struct kobject * kobj) + { +- struct dentry * dentry = dget(kobj->dentry); ++ struct dentry * dentry; + struct sysfs_dirent * parent_sd; + struct sysfs_dirent * sd, * tmp; + ++ if (!ve_sysfs_alowed()) ++ return; ++ ++ dentry = dget(kobj->dentry); + if (!dentry) + return; + +@@ -305,6 +312,9 @@ int sysfs_rename_dir(struct kobject * ko + int error = 0; + struct dentry * new_dentry, * parent; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + if (!strcmp(kobject_name(kobj), new_name)) + return -EINVAL; + +diff -uprN linux-2.6.15.orig/fs/sysfs/file.c linux-2.6.15-ve025stab014/fs/sysfs/file.c +--- linux-2.6.15.orig/fs/sysfs/file.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/sysfs/file.c 2006-01-27 14:48:08.000000000 +0300 +@@ -380,6 +380,9 @@ int sysfs_add_file(struct dentry * dir, + + int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + BUG_ON(!kobj || !kobj->dentry || !attr); + + return sysfs_add_file(kobj->dentry, attr, SYSFS_KOBJ_ATTR); +@@ -398,6 +401,9 @@ int sysfs_update_file(struct kobject * k + struct dentry * victim; + int res = -ENOENT; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + down(&dir->d_inode->i_sem); + victim = lookup_one_len(attr->name, dir, strlen(attr->name)); + if (!IS_ERR(victim)) { +@@ -473,6 +479,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); + + void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return; ++ + sysfs_hash_and_remove(kobj->dentry,attr->name); + } + +diff -uprN linux-2.6.15.orig/fs/sysfs/group.c linux-2.6.15-ve025stab014/fs/sysfs/group.c +--- linux-2.6.15.orig/fs/sysfs/group.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/sysfs/group.c 2006-01-27 14:48:08.000000000 +0300 +@@ -46,6 +46,9 @@ int sysfs_create_group(struct kobject * + struct dentry * dir; + int error; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + BUG_ON(!kobj || !kobj->dentry); + + if (grp->name) { +@@ -68,6 +71,9 @@ void sysfs_remove_group(struct kobject * + { + struct dentry * dir; + ++ if (!ve_sysfs_alowed()) ++ return; ++ + if (grp->name) + dir = lookup_one_len(grp->name, kobj->dentry, + strlen(grp->name)); +diff -uprN linux-2.6.15.orig/fs/sysfs/inode.c linux-2.6.15-ve025stab014/fs/sysfs/inode.c +--- linux-2.6.15.orig/fs/sysfs/inode.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/sysfs/inode.c 2006-01-27 14:48:08.000000000 +0300 +@@ -8,13 +8,12 @@ + + #undef DEBUG + ++#include <linux/config.h> + #include <linux/pagemap.h> + #include <linux/namei.h> + #include <linux/backing-dev.h> + #include "sysfs.h" + +-extern struct super_block * sysfs_sb; +- + static struct address_space_operations sysfs_aops = { + .readpage = simple_readpage, + .prepare_write = simple_prepare_write, +diff -uprN linux-2.6.15.orig/fs/sysfs/mount.c linux-2.6.15-ve025stab014/fs/sysfs/mount.c +--- linux-2.6.15.orig/fs/sysfs/mount.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/sysfs/mount.c 2006-01-27 14:48:08.000000000 +0300 +@@ -7,6 +7,7 @@ + #include <linux/fs.h> + #include <linux/mount.h> + #include <linux/pagemap.h> ++#include <linux/module.h> + #include <linux/init.h> + + #include "sysfs.h" +@@ -14,8 +15,11 @@ + /* 
Random magic number */ + #define SYSFS_MAGIC 0x62656572 + ++#ifndef CONFIG_VE_SYSFS + struct vfsmount *sysfs_mount; + struct super_block * sysfs_sb = NULL; ++#endif ++ + kmem_cache_t *sysfs_dir_cachep; + + static struct super_operations sysfs_ops = { +@@ -78,6 +82,8 @@ static struct file_system_type sysfs_fs_ + .kill_sb = kill_litter_super, + }; + ++EXPORT_SYMBOL(sysfs_fs_type); ++ + int __init sysfs_init(void) + { + int err = -ENOMEM; +diff -uprN linux-2.6.15.orig/fs/sysfs/symlink.c linux-2.6.15-ve025stab014/fs/sysfs/symlink.c +--- linux-2.6.15.orig/fs/sysfs/symlink.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/sysfs/symlink.c 2006-01-27 14:48:08.000000000 +0300 +@@ -86,6 +86,9 @@ int sysfs_create_link(struct kobject * k + + BUG_ON(!kobj || !kobj->dentry || !name); + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + down(&dentry->d_inode->i_sem); + error = sysfs_add_link(dentry, name, target); + up(&dentry->d_inode->i_sem); +@@ -101,6 +104,9 @@ int sysfs_create_link(struct kobject * k + + void sysfs_remove_link(struct kobject * kobj, const char * name) + { ++ if(!ve_sysfs_alowed()) ++ return; ++ + sysfs_hash_and_remove(kobj->dentry,name); + } + +diff -uprN linux-2.6.15.orig/fs/sysfs/sysfs.h linux-2.6.15-ve025stab014/fs/sysfs/sysfs.h +--- linux-2.6.15.orig/fs/sysfs/sysfs.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/sysfs/sysfs.h 2006-01-27 14:48:08.000000000 +0300 +@@ -1,5 +1,14 @@ + +-extern struct vfsmount * sysfs_mount; ++#ifndef CONFIG_VE_SYSFS ++extern struct vfsmount *sysfs_mount; ++extern struct super_block *sysfs_sb; ++#define ve_sysfs_alowed() (ve_is_super(get_exec_env())) ++#else ++#define sysfs_mount (get_exec_env()->sysfs_mnt) ++#define sysfs_sb (get_exec_env()->sysfs_sb) ++#define ve_sysfs_alowed() (1) ++#endif ++ + extern kmem_cache_t *sysfs_dir_cachep; + + extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); +@@ -19,7 +28,6 @@ extern void sysfs_drop_dentry(struct sys + extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); + + extern struct rw_semaphore sysfs_rename_sem; +-extern struct super_block * sysfs_sb; + extern struct file_operations sysfs_dir_operations; + extern struct file_operations sysfs_file_operations; + extern struct file_operations bin_fops; +diff -uprN linux-2.6.15.orig/fs/vzdq_file.c linux-2.6.15-ve025stab014/fs/vzdq_file.c +--- linux-2.6.15.orig/fs/vzdq_file.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/vzdq_file.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,852 @@ ++/* ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains Virtuozzo quota files as proc entry implementation. ++ * It is required for std quota tools to work correctly as they are expecting ++ * aquota.user and aquota.group files. ++ */ ++ ++#include <linux/ctype.h> ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/module.h> ++#include <linux/proc_fs.h> ++#include <linux/sysctl.h> ++#include <linux/mount.h> ++#include <linux/namespace.h> ++#include <linux/quotaio_v2.h> ++#include <asm/uaccess.h> ++ ++#include <linux/ve.h> ++#include <linux/ve_proto.h> ++#include <linux/vzdq_tree.h> ++#include <linux/vzquota.h> ++ ++/* ---------------------------------------------------------------------- ++ * ++ * File read operation ++ * ++ * FIXME: functions in this section (as well as many functions in vzdq_ugid.c, ++ * perhaps) abuse vz_quota_sem. 
++ * Taking a global semaphore for lengthy and user-controlled operations inside ++ * VPSs is not a good idea in general. ++ * In this case, the reasons for taking this semaphore are completely unclear, ++ * especially taking into account that the only function that has comments ++ * about the necessity to be called under this semaphore ++ * (create_proc_quotafile) is actually called OUTSIDE it. ++ * ++ * --------------------------------------------------------------------- */ ++ ++#define DQBLOCK_SIZE 1024 ++#define DQUOTBLKNUM 21U ++#define DQTREE_DEPTH 4 ++#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1) ++#define ISINDBLOCK(num) ((num)%2 != 0) ++#define FIRST_DATABLK 2 /* first even number */ ++#define LAST_IND_LEVEL (DQTREE_DEPTH - 1) ++#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS)) ++#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \ ++ & QUOTATREE_BMASK) ++ ++#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH) ++#error xBITS and DQTREE_DEPTH does not correspond ++#endif ++ ++#define BLOCK_NOT_FOUND 1 ++ ++/* data for quota file -- one per proc entry */ ++struct quotatree_data { ++ struct list_head list; ++ struct vz_quota_master *qmblk; ++ int type; /* type of the tree */ ++}; ++ ++/* serialized by vz_quota_sem */ ++static LIST_HEAD(qf_data_head); ++ ++static const u_int32_t vzquota_magics[] = V2_INITQMAGICS; ++static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS; ++ ++static inline loff_t get_depoff(int depth) ++{ ++ loff_t res = 1; ++ while (depth) { ++ res += (1 << ((depth - 1)*QUOTAID_EBITS + 1)); ++ depth--; ++ } ++ return res; ++} ++ ++static inline loff_t get_blknum(loff_t num, int depth) ++{ ++ loff_t res; ++ res = (num << 1) + get_depoff(depth); ++ return res; ++} ++ ++static int get_depth(loff_t num) ++{ ++ int i; ++ for (i = 0; i < DQTREE_DEPTH; i++) { ++ if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1 ++ || num < get_depoff(i + 1))) ++ return i; ++ } ++ return -1; ++} ++ ++static inline loff_t get_offset(loff_t num) ++{ ++ loff_t res, tmp; ++ ++ tmp = get_depth(num); ++ if (tmp < 0) ++ return -1; ++ num -= get_depoff(tmp); ++ BUG_ON(num < 0); ++ res = num >> 1; ++ ++ return res; ++} ++ ++static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level) ++{ ++ /* return maximum available block num */ ++ return tree->levels[level].freenum; ++} ++ ++static inline loff_t get_block_num(struct quotatree_tree *tree) ++{ ++ loff_t ind_blk_num, quot_blk_num, max_ind, max_quot; ++ ++ quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1); ++ max_quot = TREENUM_2_BLKNUM(quot_blk_num); ++ ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1)); ++ max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL) ++ : get_blknum(ind_blk_num, 0); ++ ++ return (max_ind > max_quot) ? 
max_ind + 1 : max_quot + 1; ++} ++ ++/* Write quota file header */ ++static int read_header(void *buf, struct quotatree_tree *tree, ++ struct dq_info *dq_ugid_info, int type) ++{ ++ struct v2_disk_dqheader *dqh; ++ struct v2_disk_dqinfo *dq_disk_info; ++ ++ dqh = buf; ++ dq_disk_info = buf + sizeof(struct v2_disk_dqheader); ++ ++ dqh->dqh_magic = vzquota_magics[type]; ++ dqh->dqh_version = vzquota_versions[type]; ++ ++ dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire; ++ dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire; ++ dq_disk_info->dqi_flags = 0; /* no flags */ ++ dq_disk_info->dqi_blocks = get_block_num(tree); ++ dq_disk_info->dqi_free_blk = 0; /* first block in the file */ ++ dq_disk_info->dqi_free_entry = FIRST_DATABLK; ++ ++ return 0; ++} ++ ++static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf) ++{ ++ int i, j, lev_num; ++ ++ lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1; ++ for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) { ++ struct quotatree_node *next, *parent; ++ ++ parent = p; ++ next = p; ++ for (j = lev_num; j >= 0; j--) { ++ if (!next->blocks[GETLEVINDX(i,j)]) { ++ buf[i] = 0; ++ goto bad_branch; ++ } ++ parent = next; ++ next = next->blocks[GETLEVINDX(i,j)]; ++ } ++ buf[i] = (depth == DQTREE_DEPTH - 1) ? ++ TREENUM_2_BLKNUM(parent->num) ++ : get_blknum(next->num, depth + 1); ++ ++ bad_branch: ++ ; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Write index block to disk (or buffer) ++ * @buf has length 256*sizeof(u_int32_t) bytes ++ */ ++static int read_index_block(int num, u_int32_t *buf, ++ struct quotatree_tree *tree) ++{ ++ struct quotatree_node *p; ++ u_int32_t index; ++ loff_t off; ++ int depth, res; ++ ++ res = BLOCK_NOT_FOUND; ++ index = 0; ++ depth = get_depth(num); ++ off = get_offset(num); ++ if (depth < 0 || off < 0) ++ return -EINVAL; ++ ++ list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh, ++ list) { ++ if (p->num >= off) ++ res = 0; ++ if (p->num != off) ++ continue; ++ get_block_child(depth, p, buf); ++ break; ++ } ++ ++ return res; ++} ++ ++static inline void convert_quot_format(struct v2_disk_dqblk *dq, ++ struct vz_quota_ugid *vzq) ++{ ++ dq->dqb_id = vzq->qugid_id; ++ dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit; ++ dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit; ++ dq->dqb_curinodes = vzq->qugid_stat.icurrent; ++ dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE; ++ dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE; ++ dq->dqb_curspace = vzq->qugid_stat.bcurrent; ++ dq->dqb_btime = vzq->qugid_stat.btime; ++ dq->dqb_itime = vzq->qugid_stat.itime; ++} ++ ++static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree) ++{ ++ int res, i, entries = 0; ++ struct v2_disk_dqdbheader *dq_header; ++ struct quotatree_node *p; ++ struct v2_disk_dqblk *blk = buf + sizeof(struct v2_disk_dqdbheader); ++ ++ res = BLOCK_NOT_FOUND; ++ dq_header = buf; ++ memset(dq_header, 0, sizeof(*dq_header)); ++ ++ list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh), ++ list) { ++ if (TREENUM_2_BLKNUM(p->num) >= num) ++ res = 0; ++ if (TREENUM_2_BLKNUM(p->num) != num) ++ continue; ++ ++ for (i = 0; i < QUOTATREE_BSIZE; i++) { ++ if (!p->blocks[i]) ++ continue; ++ convert_quot_format(blk + entries, ++ (struct vz_quota_ugid *)p->blocks[i]); ++ entries++; ++ res = 0; ++ } ++ break; ++ } ++ dq_header->dqdh_entries = entries; ++ ++ return res; ++} ++ ++static int read_block(int num, void *buf, struct quotatree_tree *tree, ++ struct dq_info *dq_ugid_info, int magic) ++{ ++ int res; 
++ ++ memset(buf, 0, DQBLOCK_SIZE); ++ if (!num) ++ res = read_header(buf, tree, dq_ugid_info, magic); ++ else if (ISINDBLOCK(num)) ++ res = read_index_block(num, (u_int32_t*)buf, tree); ++ else ++ res = read_dquot(num, buf, tree); ++ ++ return res; ++} ++ ++/* ++ * FIXME: this function can handle quota files up to 2GB only. ++ */ ++static int read_proc_quotafile(char *page, char **start, off_t off, int count, ++ int *eof, void *data) ++{ ++ off_t blk_num, blk_off, buf_off; ++ char *tmp; ++ size_t buf_size; ++ struct quotatree_data *qtd; ++ struct quotatree_tree *tree; ++ struct dq_info *dqi; ++ int res; ++ ++ qtd = data; ++ down(&vz_quota_sem); ++ down(&qtd->qmblk->dq_sem); ++ ++ res = 0; ++ tree = QUGID_TREE(qtd->qmblk, qtd->type); ++ if (!tree) { ++ *eof = 1; ++ goto out_dq; ++ } ++ ++ res = -ENOMEM; ++ tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL); ++ if (!tmp) ++ goto out_dq; ++ ++ dqi = &qtd->qmblk->dq_ugid_info[qtd->type]; ++ ++ buf_off = 0; ++ buf_size = count; ++ blk_num = off / DQBLOCK_SIZE; ++ blk_off = off % DQBLOCK_SIZE; ++ ++ while (buf_size > 0) { ++ off_t len; ++ ++ len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size); ++ res = read_block(blk_num, tmp, tree, dqi, qtd->type); ++ if (res < 0) ++ goto out_err; ++ if (res == BLOCK_NOT_FOUND) { ++ *eof = 1; ++ break; ++ } ++ memcpy(page + buf_off, tmp + blk_off, len); ++ ++ blk_num++; ++ buf_size -= len; ++ blk_off = 0; ++ buf_off += len; ++ } ++ res = buf_off; ++ ++out_err: ++ kfree(tmp); ++ *start = NULL + count; ++out_dq: ++ up(&qtd->qmblk->dq_sem); ++ up(&vz_quota_sem); ++ ++ return res; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * /proc/vz/vzaquota/QID/aquota.* files ++ * ++ * FIXME: this code lacks serialization of read/readdir/lseek. ++ * However, this problem should be fixed after the mainstream issue of what ++ * appears to be non-atomic read and update of file position in sys_read. 
++ * ++ * --------------------------------------------------------------------- */ ++ ++static inline unsigned long vzdq_aquot_getino(dev_t dev) ++{ ++ return 0xec000000UL + dev; ++} ++ ++static inline dev_t vzdq_aquot_getidev(struct inode *inode) ++{ ++ return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link; ++} ++ ++static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev) ++{ ++ PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev; ++} ++ ++static ssize_t vzdq_aquotf_read(struct file *file, ++ char __user *buf, size_t size, loff_t *ppos) ++{ ++ char *page; ++ size_t bufsize; ++ ssize_t l, l2, copied; ++ char *start; ++ struct inode *inode; ++ struct block_device *bdev; ++ struct super_block *sb; ++ struct quotatree_data data; ++ int eof, err; ++ ++ err = -ENOMEM; ++ page = (char *)__get_free_page(GFP_KERNEL); ++ if (page == NULL) ++ goto out_err; ++ ++ err = -ENODEV; ++ inode = file->f_dentry->d_inode; ++ bdev = bdget(vzdq_aquot_getidev(inode)); ++ if (bdev == NULL) ++ goto out_err; ++ sb = get_super(bdev); ++ bdput(bdev); ++ if (sb == NULL) ++ goto out_err; ++ data.qmblk = vzquota_find_qmblk(sb); ++ data.type = PROC_I(inode)->type - 1; ++ drop_super(sb); ++ if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD) ++ goto out_err; ++ ++ copied = 0; ++ l = l2 = 0; ++ while (1) { ++ bufsize = min(size, (size_t)PAGE_SIZE); ++ if (bufsize <= 0) ++ break; ++ ++ l = read_proc_quotafile(page, &start, *ppos, bufsize, ++ &eof, &data); ++ if (l <= 0) ++ break; ++ ++ l2 = copy_to_user(buf, page, l); ++ copied += l - l2; ++ if (l2) ++ break; ++ ++ buf += l; ++ size -= l; ++ *ppos += (unsigned long)start; ++ l = l2 = 0; ++ } ++ ++ qmblk_put(data.qmblk); ++ free_page((unsigned long)page); ++ if (copied) ++ return copied; ++ else if (l2) /* last copy_to_user failed */ ++ return -EFAULT; ++ else /* read error or EOF */ ++ return l; ++ ++out_err: ++ if (page != NULL) ++ free_page((unsigned long)page); ++ return err; ++} ++ ++static struct file_operations vzdq_aquotf_file_operations = { ++ .read = &vzdq_aquotf_read, ++}; ++ ++static struct inode_operations vzdq_aquotf_inode_operations = { ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * /proc/vz/vzaquota/QID directory ++ * ++ * --------------------------------------------------------------------- */ ++ ++static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler) ++{ ++ loff_t n; ++ int err; ++ ++ n = file->f_pos; ++ for (err = 0; !err; n++) { ++ switch (n) { ++ case 0: ++ err = (*filler)(data, ".", 1, n, ++ file->f_dentry->d_inode->i_ino, ++ DT_DIR); ++ break; ++ case 1: ++ err = (*filler)(data, "..", 2, n, ++ parent_ino(file->f_dentry), DT_DIR); ++ break; ++ case 2: ++ err = (*filler)(data, "aquota.user", 11, n, ++ file->f_dentry->d_inode->i_ino ++ + USRQUOTA + 1, ++ DT_REG); ++ break; ++ case 3: ++ err = (*filler)(data, "aquota.group", 12, n, ++ file->f_dentry->d_inode->i_ino ++ + GRPQUOTA + 1, ++ DT_REG); ++ break; ++ default: ++ goto out; ++ } ++ } ++out: ++ file->f_pos = n; ++ return err; ++} ++ ++struct vzdq_aquotq_lookdata { ++ dev_t dev; ++ int type; ++}; ++ ++static int vzdq_aquotq_looktest(struct inode *inode, void *data) ++{ ++ struct vzdq_aquotq_lookdata *d; ++ ++ d = data; ++ return inode->i_op == &vzdq_aquotf_inode_operations && ++ vzdq_aquot_getidev(inode) == d->dev && ++ PROC_I(inode)->type == d->type + 1; ++} ++ ++static int vzdq_aquotq_lookset(struct inode *inode, void *data) ++{ ++ struct vzdq_aquotq_lookdata *d; ++ ++ d = data; ++ 
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1; ++ inode->i_mode = S_IFREG | S_IRUSR; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_op = &vzdq_aquotf_inode_operations; ++ inode->i_fop = &vzdq_aquotf_file_operations; ++ PROC_I(inode)->type = d->type + 1; ++ vzdq_aquot_setidev(inode, d->dev); ++ return 0; ++} ++ ++static struct dentry *vzdq_aquotq_lookup(struct inode *dir, ++ struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ struct vzdq_aquotq_lookdata d; ++ int k; ++ ++ if (dentry->d_name.len == 11) { ++ if (memcmp(dentry->d_name.name, "aquota.user", 11)) ++ goto out; ++ k = USRQUOTA; ++ } else if (dentry->d_name.len == 12) { ++ if (memcmp(dentry->d_name.name, "aquota.group", 11)) ++ goto out; ++ k = GRPQUOTA; ++ } else ++ goto out; ++ d.dev = vzdq_aquot_getidev(dir); ++ d.type = k; ++ inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1, ++ vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d); ++ if (inode == NULL) ++ goto out; ++ unlock_new_inode(inode); ++ d_add(dentry, inode); ++ return NULL; ++ ++out: ++ return ERR_PTR(-ENOENT); ++} ++ ++static struct file_operations vzdq_aquotq_file_operations = { ++ .read = &generic_read_dir, ++ .readdir = &vzdq_aquotq_readdir, ++}; ++ ++static struct inode_operations vzdq_aquotq_inode_operations = { ++ .lookup = &vzdq_aquotq_lookup, ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * /proc/vz/vzaquota directory ++ * ++ * --------------------------------------------------------------------- */ ++ ++struct vzdq_aquot_de { ++ struct list_head list; ++ struct vfsmount *mnt; ++}; ++ ++static int vzdq_aquot_buildmntlist(struct ve_struct *ve, ++ struct list_head *head) ++{ ++ struct vfsmount *rmnt, *mnt; ++ struct vzdq_aquot_de *p; ++ int err; ++ ++#ifdef CONFIG_VE ++ rmnt = mntget(ve->fs_rootmnt); ++#else ++ read_lock(¤t->fs->lock); ++ rmnt = mntget(current->fs->rootmnt); ++ read_unlock(¤t->fs->lock); ++#endif ++ mnt = rmnt; ++ spin_lock(&vfsmount_lock); ++ while (1) { ++ list_for_each_entry(p, head, list) { ++ if (p->mnt->mnt_sb == mnt->mnt_sb) ++ goto skip; ++ } ++ ++ err = -ENOMEM; ++ p = kmalloc(sizeof(*p), GFP_KERNEL); ++ if (p == NULL) ++ goto out; ++ p->mnt = mntget(mnt); ++ list_add_tail(&p->list, head); ++ ++skip: ++ err = 0; ++ if (list_empty(&mnt->mnt_mounts)) { ++ while (1) { ++ if (mnt == rmnt) ++ goto out; ++ if (mnt->mnt_child.next != ++ &mnt->mnt_parent->mnt_mounts) ++ break; ++ mnt = mnt->mnt_parent; ++ } ++ mnt = list_entry(mnt->mnt_child.next, ++ struct vfsmount, mnt_child); ++ } else ++ mnt = list_entry(mnt->mnt_mounts.next, ++ struct vfsmount, mnt_child); ++ } ++out: ++ spin_unlock(&vfsmount_lock); ++ mntput(rmnt); ++ return err; ++} ++ ++static void vzdq_aquot_releasemntlist(struct ve_struct *ve, ++ struct list_head *head) ++{ ++ struct vzdq_aquot_de *p; ++ ++ while (!list_empty(head)) { ++ p = list_entry(head->next, typeof(*p), list); ++ mntput(p->mnt); ++ list_del(&p->list); ++ kfree(p); ++ } ++} ++ ++static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler) ++{ ++ struct ve_struct *ve, *old_ve; ++ struct list_head mntlist; ++ struct vzdq_aquot_de *de; ++ struct super_block *sb; ++ struct vz_quota_master *qmblk; ++ loff_t i, n; ++ char buf[24]; ++ int l, err; ++ ++ i = 0; ++ n = file->f_pos; ++ ve = VE_OWNER_FSTYPE(file->f_dentry->d_sb->s_type); ++ old_ve = set_exec_env(ve); ++ ++ INIT_LIST_HEAD(&mntlist); ++#ifdef CONFIG_VE ++ /* ++ * The 
only reason of disabling readdir for the host system is that ++ * this readdir can be slow and CPU consuming with large number of VPSs ++ * (or just mount points). ++ */ ++ err = ve_is_super(ve); ++#else ++ err = 0; ++#endif ++ if (!err) { ++ err = vzdq_aquot_buildmntlist(ve, &mntlist); ++ if (err) ++ goto out_err; ++ } ++ ++ if (i >= n) { ++ if ((*filler)(data, ".", 1, i, ++ file->f_dentry->d_inode->i_ino, DT_DIR)) ++ goto out_fill; ++ } ++ i++; ++ ++ if (i >= n) { ++ if ((*filler)(data, "..", 2, i, ++ parent_ino(file->f_dentry), DT_DIR)) ++ goto out_fill; ++ } ++ i++; ++ ++ list_for_each_entry (de, &mntlist, list) { ++ sb = de->mnt->mnt_sb; ++#ifdef CONFIG_VE ++ if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL)) ++ continue; ++#endif ++ qmblk = vzquota_find_qmblk(sb); ++ if (qmblk == NULL || qmblk == VZ_QUOTA_BAD) ++ continue; ++ ++ qmblk_put(qmblk); ++ i++; ++ if (i <= n) ++ continue; ++ ++ l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev)); ++ if ((*filler)(data, buf, l, i - 1, ++ vzdq_aquot_getino(sb->s_dev), DT_DIR)) ++ break; ++ } ++ ++out_fill: ++ err = 0; ++ file->f_pos = i; ++out_err: ++ vzdq_aquot_releasemntlist(ve, &mntlist); ++ (void)set_exec_env(old_ve); ++ return err; ++} ++ ++static int vzdq_aquotd_looktest(struct inode *inode, void *data) ++{ ++ return inode->i_op == &vzdq_aquotq_inode_operations && ++ vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data; ++} ++ ++static int vzdq_aquotd_lookset(struct inode *inode, void *data) ++{ ++ dev_t dev; ++ ++ dev = (dev_t)(unsigned long)data; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ inode->i_ino = vzdq_aquot_getino(dev); ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 2; ++ inode->i_op = &vzdq_aquotq_inode_operations; ++ inode->i_fop = &vzdq_aquotq_file_operations; ++ vzdq_aquot_setidev(inode, dev); ++ return 0; ++} ++ ++static struct dentry *vzdq_aquotd_lookup(struct inode *dir, ++ struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct ve_struct *ve, *old_ve; ++ const unsigned char *s; ++ int l; ++ dev_t dev; ++ struct inode *inode; ++ ++ ve = VE_OWNER_FSTYPE(dir->i_sb->s_type); ++ old_ve = set_exec_env(ve); ++#ifdef CONFIG_VE ++ /* ++ * Lookup is much lighter than readdir, so it can be allowed for the ++ * host system. But it would be strange to be able to do lookup only ++ * without readdir... 
++ */ ++ if (ve_is_super(ve)) ++ goto out; ++#endif ++ ++ dev = 0; ++ l = dentry->d_name.len; ++ if (l <= 0) ++ goto out; ++ for (s = dentry->d_name.name; l > 0; s++, l--) { ++ if (!isxdigit(*s)) ++ goto out; ++ if (dev & ~(~0UL >> 4)) ++ goto out; ++ dev <<= 4; ++ if (isdigit(*s)) ++ dev += *s - '0'; ++ else if (islower(*s)) ++ dev += *s - 'a' + 10; ++ else ++ dev += *s - 'A' + 10; ++ } ++ dev = new_decode_dev(dev); ++ ++#ifdef CONFIG_VE ++ if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL)) ++ goto out; ++#endif ++ ++ inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev), ++ vzdq_aquotd_looktest, vzdq_aquotd_lookset, ++ (void *)(unsigned long)dev); ++ if (inode == NULL) ++ goto out; ++ unlock_new_inode(inode); ++ ++ d_add(dentry, inode); ++ (void)set_exec_env(old_ve); ++ return NULL; ++ ++out: ++ (void)set_exec_env(old_ve); ++ return ERR_PTR(-ENOENT); ++} ++ ++static struct file_operations vzdq_aquotd_file_operations = { ++ .read = &generic_read_dir, ++ .readdir = &vzdq_aquotd_readdir, ++}; ++ ++static struct inode_operations vzdq_aquotd_inode_operations = { ++ .lookup = &vzdq_aquotd_lookup, ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Initialization and deinitialization ++ * ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * FIXME: creation of proc entries here is unsafe with respect to module ++ * unloading. ++ */ ++void vzaquota_init(void) ++{ ++ struct proc_dir_entry *de; ++ ++ de = create_proc_glob_entry("vz/vzaquota", ++ S_IFDIR | S_IRUSR | S_IXUSR, NULL); ++ if (de != NULL) { ++ de->proc_iops = &vzdq_aquotd_inode_operations; ++ de->proc_fops = &vzdq_aquotd_file_operations; ++ } else ++ printk("VZDQ: vz/vzaquota creation failed\n"); ++#if defined(CONFIG_SYSCTL) ++ de = create_proc_glob_entry("sys/fs/quota", ++ S_IFDIR | S_IRUSR | S_IXUSR, NULL); ++ if (de == NULL) ++ printk("VZDQ: sys/fs/quota creation failed\n"); ++#endif ++} ++ ++void vzaquota_fini(void) ++{ ++} +diff -uprN linux-2.6.15.orig/fs/vzdq_mgmt.c linux-2.6.15-ve025stab014/fs/vzdq_mgmt.c +--- linux-2.6.15.orig/fs/vzdq_mgmt.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/vzdq_mgmt.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,714 @@ ++/* ++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ */ ++ ++#include <linux/config.h> ++#include <linux/kernel.h> ++#include <linux/string.h> ++#include <linux/list.h> ++#include <asm/semaphore.h> ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/dcache.h> ++#include <linux/mount.h> ++#include <linux/namei.h> ++#include <linux/writeback.h> ++#include <linux/gfp.h> ++#include <asm/uaccess.h> ++#include <linux/proc_fs.h> ++#include <linux/quota.h> ++#include <linux/vzctl_quota.h> ++#include <linux/vzquota.h> ++ ++ ++/* ---------------------------------------------------------------------- ++ * Switching quota on. 
++ * --------------------------------------------------------------------- */ ++ ++/* ++ * check limits copied from user ++ */ ++int vzquota_check_sane_limits(struct dq_stat *qstat) ++{ ++ int err; ++ ++ err = -EINVAL; ++ ++ /* softlimit must be less then hardlimit */ ++ if (qstat->bsoftlimit > qstat->bhardlimit) ++ goto out; ++ ++ if (qstat->isoftlimit > qstat->ihardlimit) ++ goto out; ++ ++ err = 0; ++out: ++ return err; ++} ++ ++/* ++ * check usage values copied from user ++ */ ++int vzquota_check_sane_values(struct dq_stat *qstat) ++{ ++ int err; ++ ++ err = -EINVAL; ++ ++ /* expiration time must not be set if softlimit was not exceeded */ ++ if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != (time_t)0) ++ goto out; ++ ++ if (qstat->icurrent < qstat->isoftlimit && qstat->itime != (time_t)0) ++ goto out; ++ ++ err = vzquota_check_sane_limits(qstat); ++out: ++ return err; ++} ++ ++/* ++ * create new quota master block ++ * this function should: ++ * - copy limits and usage parameters from user buffer; ++ * - allock, initialize quota block and insert it to hash; ++ */ ++static int vzquota_create(unsigned int quota_id, struct vz_quota_stat *u_qstat) ++{ ++ int err; ++ struct vz_quota_stat qstat; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); ++ ++ err = -EFAULT; ++ if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) ++ goto out; ++ ++ err = -EINVAL; ++ if (quota_id == 0) ++ goto out; ++ ++ if (vzquota_check_sane_values(&qstat.dq_stat)) ++ goto out; ++ err = 0; ++ qmblk = vzquota_alloc_master(quota_id, &qstat); ++ ++ if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */ ++ err = PTR_ERR(qmblk); ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++/** ++ * vzquota_on - turn quota on ++ * ++ * This function should: ++ * - find and get refcnt of directory entry for quota root and corresponding ++ * mountpoint; ++ * - find corresponding quota block and mark it with given path; ++ * - check quota tree; ++ * - initialize quota for the tree root. ++ */ ++static int vzquota_on(unsigned int quota_id, const char *quota_root) ++{ ++ int err; ++ struct nameidata nd; ++ struct vz_quota_master *qmblk; ++ struct super_block *dqsb; ++ ++ dqsb = NULL; ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EBUSY; ++ if (qmblk->dq_state != VZDQ_STARTING) ++ goto out; ++ ++ err = user_path_walk(quota_root, &nd); ++ if (err) ++ goto out; ++ /* init path must be a directory */ ++ err = -ENOTDIR; ++ if (!S_ISDIR(nd.dentry->d_inode->i_mode)) ++ goto out_path; ++ ++ qmblk->dq_root_dentry = nd.dentry; ++ qmblk->dq_root_mnt = nd.mnt; ++ qmblk->dq_sb = nd.dentry->d_inode->i_sb; ++ err = vzquota_get_super(qmblk->dq_sb); ++ if (err) ++ goto out_super; ++ ++ /* ++ * Serialization with quota initialization and operations is performed ++ * through generation check: generation is memorized before qmblk is ++ * found and compared under inode_qmblk_lock with assignment. ++ * ++ * Note that the dentry tree is shrunk only for high-level logical ++ * serialization, purely as a courtesy to the user: to have consistent ++ * quota statistics, files should be closed etc. on quota on. 
++ */ ++ err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_dentry->d_inode, ++ qmblk); ++ if (err) ++ goto out_init; ++ qmblk->dq_state = VZDQ_WORKING; ++ ++ up(&vz_quota_sem); ++ return 0; ++ ++out_init: ++ dqsb = qmblk->dq_sb; ++out_super: ++ /* clear for qmblk_put/quota_free_master */ ++ qmblk->dq_sb = NULL; ++ qmblk->dq_root_dentry = NULL; ++ qmblk->dq_root_mnt = NULL; ++out_path: ++ path_release(&nd); ++out: ++ if (dqsb) ++ vzquota_put_super(dqsb); ++ up(&vz_quota_sem); ++ return err; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Switching quota off. ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * destroy quota block by ID ++ */ ++static int vzquota_destroy(unsigned int quota_id) ++{ ++ int err; ++ struct vz_quota_master *qmblk; ++ struct dentry *dentry; ++ struct vfsmount *mnt; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EBUSY; ++ if (qmblk->dq_state == VZDQ_WORKING) ++ goto out; /* quota_off first */ ++ ++ list_del_init(&qmblk->dq_hash); ++ dentry = qmblk->dq_root_dentry; ++ qmblk->dq_root_dentry = NULL; ++ mnt = qmblk->dq_root_mnt; ++ qmblk->dq_root_mnt = NULL; ++ ++ if (qmblk->dq_sb) ++ vzquota_put_super(qmblk->dq_sb); ++ up(&vz_quota_sem); ++ ++ qmblk_put(qmblk); ++ dput(dentry); ++ mntput(mnt); ++ return 0; ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++/** ++ * vzquota_off - turn quota off ++ */ ++static int vzquota_sync_list(struct list_head *lh, ++ struct vz_quota_master *qmblk) ++{ ++ int err; ++ LIST_HEAD(list); ++ struct vz_quota_ilink *qlnk; ++ struct inode *inode; ++ struct writeback_control wbc; ++ ++ memset(&wbc, 0, sizeof(wbc)); ++ wbc.sync_mode = WB_SYNC_ALL; ++ ++ err = 0; ++ do { ++ inode = NULL; ++ list_for_each_entry (qlnk, lh, list) { ++ inode = igrab(QLNK_INODE(qlnk)); ++ if (inode) ++ break; ++ } ++ if (inode == NULL) ++ break; ++ ++ list_move(&qlnk->list, &list); ++ inode_qmblk_unlock(qmblk->dq_sb); ++ ++ wbc.nr_to_write = LONG_MAX; ++ err = sync_inode(inode, &wbc); ++ iput(inode); ++ ++ inode_qmblk_lock(qmblk->dq_sb); ++ } while (!err); ++ ++ list_splice(&list, lh); ++ return err; ++} ++ ++static int vzquota_sync_inodes(struct vz_quota_master *qmblk) ++{ ++ int err; ++ LIST_HEAD(qlnk_list); ++ ++ list_splice_init(&qmblk->dq_ilink_list, &qlnk_list); ++ err = vzquota_sync_list(&qlnk_list, qmblk); ++ if (!err && !list_empty(&qmblk->dq_ilink_list)) ++ err = -EBUSY; ++ list_splice(&qlnk_list, &qmblk->dq_ilink_list); ++ ++ return err; ++} ++ ++static int vzquota_off(unsigned int quota_id) ++{ ++ int err; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EALREADY; ++ if (qmblk->dq_state != VZDQ_WORKING) ++ goto out; ++ ++ inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */ ++ err = vzquota_sync_inodes(qmblk); ++ if (err) ++ goto out_unlock; ++ inode_qmblk_unlock(qmblk->dq_sb); ++ ++ err = vzquota_off_qmblk(qmblk->dq_sb, qmblk); ++ if (err) ++ goto out; ++ ++ /* vzquota_destroy will free resources */ ++ qmblk->dq_state = VZDQ_STOPING; ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++ ++out_unlock: ++ inode_qmblk_unlock(qmblk->dq_sb); ++ goto out; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Other VZQUOTA ioctl's. 
++ * --------------------------------------------------------------------- */ ++ ++/* ++ * this function should: ++ * - set new limits/buffer under quota master block lock ++ * - if new softlimit less then usage, then set expiration time ++ * - no need to alloc ugid hash table - we'll do that on demand ++ */ ++int vzquota_update_limit(struct dq_stat *_qstat, ++ struct dq_stat *qstat) ++{ ++ int err; ++ ++ err = -EINVAL; ++ if (vzquota_check_sane_limits(qstat)) ++ goto out; ++ ++ err = 0; ++ ++ /* limits */ ++ _qstat->bsoftlimit = qstat->bsoftlimit; ++ _qstat->bhardlimit = qstat->bhardlimit; ++ /* ++ * If the soft limit is exceeded, administrator can override the moment ++ * when the grace period for limit exceeding ends. ++ * Specifying the moment may be useful if the soft limit is set to be ++ * lower than the current usage. In the latter case, if the grace ++ * period end isn't specified, the grace period will start from the ++ * moment of the first write operation. ++ * There is a race with the user level. Soft limit may be already ++ * exceeded before the limit change, and grace period end calculated by ++ * the kernel will be overriden. User level may check if the limit is ++ * already exceeded, but check and set calls are not atomic. ++ * This race isn't dangerous. Under normal cicrumstances, the ++ * difference between the grace period end calculated by the kernel and ++ * the user level should be not greater than as the difference between ++ * the moments of check and set calls, i.e. not bigger than the quota ++ * timer resolution - 1 sec. ++ */ ++ if (qstat->btime != (time_t)0 && ++ _qstat->bcurrent >= _qstat->bsoftlimit) ++ _qstat->btime = qstat->btime; ++ ++ _qstat->isoftlimit = qstat->isoftlimit; ++ _qstat->ihardlimit = qstat->ihardlimit; ++ if (qstat->itime != (time_t)0 && ++ _qstat->icurrent >= _qstat->isoftlimit) ++ _qstat->itime = qstat->itime; ++ ++out: ++ return err; ++} ++ ++/* ++ * set new quota limits. ++ * this function should: ++ * copy new limits from user level ++ * - find quota block ++ * - set new limits and flags. ++ */ ++static int vzquota_setlimit(unsigned int quota_id, ++ struct vz_quota_stat *u_qstat) ++{ ++ int err; ++ struct vz_quota_stat qstat; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); /* for hash list protection */ ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) ++ goto out; ++ ++ qmblk_data_write_lock(qmblk); ++ err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat); ++ if (err == 0) ++ qmblk->dq_info = qstat.dq_info; ++ qmblk_data_write_unlock(qmblk); ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++/* ++ * get quota limits. 
++ * very simple - just return stat buffer to user ++ */ ++static int vzquota_getstat(unsigned int quota_id, ++ struct vz_quota_stat *u_qstat) ++{ ++ int err; ++ struct vz_quota_stat qstat; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ qmblk_data_read_lock(qmblk); ++ /* copy whole buffer under lock */ ++ memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat)); ++ memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info)); ++ qmblk_data_read_unlock(qmblk); ++ ++ err = copy_to_user(u_qstat, &qstat, sizeof(qstat)); ++ if (err) ++ err = -EFAULT; ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++/* ++ * This is a system call to turn per-VE disk quota on. ++ * Note this call is allowed to run ONLY from VE0 ++ */ ++long do_vzquotactl(int cmd, unsigned int quota_id, ++ struct vz_quota_stat *qstat, const char *ve_root) ++{ ++ int ret; ++ ++ ret = -EPERM; ++ /* access allowed only from root of VE0 */ ++ if (!capable(CAP_SYS_RESOURCE) || ++ !capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ switch (cmd) { ++ case VZ_DQ_CREATE: ++ ret = vzquota_create(quota_id, qstat); ++ break; ++ case VZ_DQ_DESTROY: ++ ret = vzquota_destroy(quota_id); ++ break; ++ case VZ_DQ_ON: ++ ret = vzquota_on(quota_id, ve_root); ++ break; ++ case VZ_DQ_OFF: ++ ret = vzquota_off(quota_id); ++ break; ++ case VZ_DQ_SETLIMIT: ++ ret = vzquota_setlimit(quota_id, qstat); ++ break; ++ case VZ_DQ_GETSTAT: ++ ret = vzquota_getstat(quota_id, qstat); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ goto out; ++ } ++ ++out: ++ return ret; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Proc filesystem routines ++ * ---------------------------------------------------------------------*/ ++ ++#if defined(CONFIG_PROC_FS) ++ ++#define QUOTA_UINT_LEN 15 ++#define QUOTA_TIME_LEN_FMT_UINT "%11u" ++#define QUOTA_NUM_LEN_FMT_UINT "%15u" ++#define QUOTA_NUM_LEN_FMT_ULL "%15Lu" ++#define QUOTA_TIME_LEN_FMT_STR "%11s" ++#define QUOTA_NUM_LEN_FMT_STR "%15s" ++#define QUOTA_PROC_MAX_LINE_LEN 2048 ++ ++/* ++ * prints /proc/ve_dq header line ++ */ ++static int print_proc_header(char * buffer) ++{ ++ return sprintf(buffer, ++ "%-11s" ++ QUOTA_NUM_LEN_FMT_STR ++ QUOTA_NUM_LEN_FMT_STR ++ QUOTA_NUM_LEN_FMT_STR ++ QUOTA_TIME_LEN_FMT_STR ++ QUOTA_TIME_LEN_FMT_STR ++ "\n", ++ "qid: path", ++ "usage", "softlimit", "hardlimit", "time", "expire"); ++} ++ ++/* ++ * prints proc master record id, dentry path ++ */ ++static int print_proc_master_id(char * buffer, char * path_buf, ++ struct vz_quota_master * qp) ++{ ++ char *path; ++ int over; ++ ++ path = NULL; ++ switch (qp->dq_state) { ++ case VZDQ_WORKING: ++ if (!path_buf) { ++ path = ""; ++ break; ++ } ++ path = d_path(qp->dq_root_dentry, ++ qp->dq_root_mnt, path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) { ++ path = ""; ++ break; ++ } ++ /* do not print large path, truncate it */ ++ over = strlen(path) - ++ (QUOTA_PROC_MAX_LINE_LEN - 3 - 3 - ++ QUOTA_UINT_LEN); ++ if (over > 0) { ++ path += over - 3; ++ path[0] = path[1] = path[3] = '.'; ++ } ++ break; ++ case VZDQ_STARTING: ++ path = "-- started --"; ++ break; ++ case VZDQ_STOPING: ++ path = "-- stopped --"; ++ break; ++ } ++ ++ return sprintf(buffer, "%u: %s\n", qp->dq_id, path); ++} ++ ++/* ++ * prints struct vz_quota_stat data ++ */ ++static int print_proc_stat(char * buffer, struct dq_stat *qs, ++ struct dq_info *qi) ++{ ++ return sprintf(buffer, ++ "%11s" ++ QUOTA_NUM_LEN_FMT_ULL ++ 
QUOTA_NUM_LEN_FMT_ULL ++ QUOTA_NUM_LEN_FMT_ULL ++ QUOTA_TIME_LEN_FMT_UINT ++ QUOTA_TIME_LEN_FMT_UINT ++ "\n" ++ "%11s" ++ QUOTA_NUM_LEN_FMT_UINT ++ QUOTA_NUM_LEN_FMT_UINT ++ QUOTA_NUM_LEN_FMT_UINT ++ QUOTA_TIME_LEN_FMT_UINT ++ QUOTA_TIME_LEN_FMT_UINT ++ "\n", ++ "1k-blocks", ++ qs->bcurrent >> 10, ++ qs->bsoftlimit >> 10, ++ qs->bhardlimit >> 10, ++ (unsigned int)qs->btime, ++ (unsigned int)qi->bexpire, ++ "inodes", ++ qs->icurrent, ++ qs->isoftlimit, ++ qs->ihardlimit, ++ (unsigned int)qs->itime, ++ (unsigned int)qi->iexpire); ++} ++ ++ ++/* ++ * for /proc filesystem output ++ */ ++static int vzquota_read_proc(char *page, char **start, off_t off, int count, ++ int *eof, void *data) ++{ ++ int len, i; ++ off_t printed = 0; ++ char *p = page; ++ struct vz_quota_master *qp; ++ struct vz_quota_ilink *ql2; ++ struct list_head *listp; ++ char *path_buf; ++ ++ path_buf = (char*)__get_free_page(GFP_KERNEL); ++ if (path_buf == NULL) ++ return -ENOMEM; ++ ++ len = print_proc_header(p); ++ printed += len; ++ if (off < printed) /* keep header in output */ { ++ *start = p + off; ++ p += len; ++ } ++ ++ down(&vz_quota_sem); ++ ++ /* traverse master hash table for all records */ ++ for (i = 0; i < vzquota_hash_size; i++) { ++ list_for_each(listp, &vzquota_hash_table[i]) { ++ qp = list_entry(listp, ++ struct vz_quota_master, dq_hash); ++ ++ /* Skip other VE's information if not root of VE0 */ ++ if ((!capable(CAP_SYS_ADMIN) || ++ !capable(CAP_SYS_RESOURCE))) { ++ ql2 = INODE_QLNK(current->fs->root->d_inode); ++ if (ql2 == NULL || qp != ql2->qmblk) ++ continue; ++ } ++ /* ++ * Now print the next record ++ */ ++ len = 0; ++ /* we print quotaid and path only in VE0 */ ++ if (capable(CAP_SYS_ADMIN)) ++ len += print_proc_master_id(p+len,path_buf, qp); ++ len += print_proc_stat(p+len, &qp->dq_stat, ++ &qp->dq_info); ++ printed += len; ++ /* skip unnecessary lines */ ++ if (printed <= off) ++ continue; ++ p += len; ++ /* provide start offset */ ++ if (*start == NULL) ++ *start = p + (off - printed); ++ /* have we printed all requested size? */ ++ if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN || ++ (p - *start) >= count) ++ goto out; ++ } ++ } ++ ++ *eof = 1; /* checked all hash */ ++out: ++ up(&vz_quota_sem); ++ ++ len = 0; ++ if (*start != NULL) { ++ len = (p - *start); ++ if (len > count) ++ len = count; ++ } ++ ++ if (path_buf) ++ free_page((unsigned long) path_buf); ++ ++ return len; ++} ++ ++/* ++ * Register procfs read callback ++ */ ++int vzquota_proc_init(void) ++{ ++ struct proc_dir_entry *de; ++ ++ de = create_proc_entry("vz/vzquota", S_IFREG|S_IRUSR, NULL); ++ if (de == NULL) { ++ /* create "vz" subdirectory, if not exist */ ++ de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); ++ if (de == NULL) ++ goto out_err; ++ de = create_proc_entry("vzquota", S_IFREG|S_IRUSR, de); ++ if (de == NULL) ++ goto out_err; ++ } ++ de->read_proc = vzquota_read_proc; ++ de->data = NULL; ++ return 0; ++out_err: ++ return -EBUSY; ++} ++ ++void vzquota_proc_release(void) ++{ ++ /* Unregister procfs read callback */ ++ remove_proc_entry("vz/vzquota", NULL); ++} ++ ++#endif +diff -uprN linux-2.6.15.orig/fs/vzdq_ops.c linux-2.6.15-ve025stab014/fs/vzdq_ops.c +--- linux-2.6.15.orig/fs/vzdq_ops.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/vzdq_ops.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,565 @@ ++/* ++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ */ ++ ++#include <linux/config.h> ++#include <linux/kernel.h> ++#include <linux/types.h> ++#include <asm/semaphore.h> ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/quota.h> ++#include <linux/vzquota.h> ++ ++ ++/* ---------------------------------------------------------------------- ++ * Quota superblock operations - helper functions. ++ * --------------------------------------------------------------------- */ ++ ++static inline void vzquota_incr_inodes(struct dq_stat *dqstat, ++ unsigned long number) ++{ ++ dqstat->icurrent += number; ++} ++ ++static inline void vzquota_incr_space(struct dq_stat *dqstat, ++ __u64 number) ++{ ++ dqstat->bcurrent += number; ++} ++ ++static inline void vzquota_decr_inodes(struct dq_stat *dqstat, ++ unsigned long number) ++{ ++ if (dqstat->icurrent > number) ++ dqstat->icurrent -= number; ++ else ++ dqstat->icurrent = 0; ++ if (dqstat->icurrent < dqstat->isoftlimit) ++ dqstat->itime = (time_t) 0; ++} ++ ++static inline void vzquota_decr_space(struct dq_stat *dqstat, ++ __u64 number) ++{ ++ if (dqstat->bcurrent > number) ++ dqstat->bcurrent -= number; ++ else ++ dqstat->bcurrent = 0; ++ if (dqstat->bcurrent < dqstat->bsoftlimit) ++ dqstat->btime = (time_t) 0; ++} ++ ++/* ++ * better printk() message or use /proc/vzquotamsg interface ++ * similar to /proc/kmsg ++ */ ++static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag, ++ const char *fmt) ++{ ++ if (dq_info->flags & flag) /* warning already printed for this ++ masterblock */ ++ return; ++ printk(fmt, dq_id); ++ dq_info->flags |= flag; ++} ++ ++/* ++ * ignore_hardlimit - ++ * ++ * Intended to allow superuser of VE0 to overwrite hardlimits. ++ * ++ * ignore_hardlimit() has a very bad feature: ++ * ++ * writepage() operation for writable mapping of a file with holes ++ * may trigger get_block() with wrong current and as a consequence, ++ * opens a possibility to overcommit hardlimits ++ */ ++/* for the reason above, it is disabled now */ ++static inline int ignore_hardlimit(struct dq_info *dqstat) ++{ ++#if 0 ++ return ve_is_super(get_exec_env()) && ++ capable(CAP_SYS_RESOURCE) && ++ (dqstat->options & VZ_QUOTA_OPT_RSQUASH); ++#else ++ return 0; ++#endif ++} ++ ++static int vzquota_check_inodes(struct dq_info *dq_info, ++ struct dq_stat *dqstat, ++ unsigned long number, int dq_id) ++{ ++ if (number == 0) ++ return QUOTA_OK; ++ ++ if (dqstat->icurrent + number > dqstat->ihardlimit && ++ !ignore_hardlimit(dq_info)) { ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, ++ "VZ QUOTA: file hardlimit reached for id=%d\n"); ++ return NO_QUOTA; ++ } ++ ++ if (dqstat->icurrent + number > dqstat->isoftlimit) { ++ if (dqstat->itime == (time_t)0) { ++ vzquota_warn(dq_info, dq_id, 0, ++ "VZ QUOTA: file softlimit exceeded " ++ "for id=%d\n"); ++ dqstat->itime = CURRENT_TIME_SEC.tv_sec + ++ dq_info->iexpire; ++ } else if (CURRENT_TIME_SEC.tv_sec >= dqstat->itime && ++ !ignore_hardlimit(dq_info)) { ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, ++ "VZ QUOTA: file softlimit expired " ++ "for id=%d\n"); ++ return NO_QUOTA; ++ } ++ } ++ ++ return QUOTA_OK; ++} ++ ++static int vzquota_check_space(struct dq_info *dq_info, ++ struct dq_stat *dqstat, ++ __u64 number, int dq_id, char prealloc) ++{ ++ if (number == 0) ++ return QUOTA_OK; ++ ++ if (dqstat->bcurrent + number > dqstat->bhardlimit && ++ !ignore_hardlimit(dq_info)) { ++ if (!prealloc) ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, ++ "VZ QUOTA: disk hardlimit reached " ++ "for id=%d\n"); ++ return NO_QUOTA; ++ } ++ ++ if 
(dqstat->bcurrent + number > dqstat->bsoftlimit) { ++ if (dqstat->btime == (time_t)0) { ++ if (!prealloc) { ++ vzquota_warn(dq_info, dq_id, 0, ++ "VZ QUOTA: disk softlimit exceeded " ++ "for id=%d\n"); ++ dqstat->btime = CURRENT_TIME_SEC.tv_sec ++ + dq_info->bexpire; ++ } else { ++ /* ++ * Original Linux quota doesn't allow ++ * preallocation to exceed softlimit so ++ * exceeding will be always printed ++ */ ++ return NO_QUOTA; ++ } ++ } else if (CURRENT_TIME_SEC.tv_sec >= dqstat->btime && ++ !ignore_hardlimit(dq_info)) { ++ if (!prealloc) ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, ++ "VZ QUOTA: disk quota " ++ "softlimit expired " ++ "for id=%d\n"); ++ return NO_QUOTA; ++ } ++ } ++ ++ return QUOTA_OK; ++} ++ ++static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid[], ++ int type, unsigned long number) ++{ ++ struct dq_info *dqinfo; ++ struct dq_stat *dqstat; ++ ++ if (qugid[type] == NULL) ++ return QUOTA_OK; ++ if (qugid[type] == VZ_QUOTA_UGBAD) ++ return NO_QUOTA; ++ ++ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) ++ return QUOTA_OK; ++ if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) ++ return QUOTA_OK; ++ if (number == 0) ++ return QUOTA_OK; ++ ++ dqinfo = &qmblk->dq_ugid_info[type]; ++ dqstat = &qugid[type]->qugid_stat; ++ ++ if (dqstat->ihardlimit != 0 && ++ dqstat->icurrent + number > dqstat->ihardlimit) ++ return NO_QUOTA; ++ ++ if (dqstat->isoftlimit != 0 && ++ dqstat->icurrent + number > dqstat->isoftlimit) { ++ if (dqstat->itime == (time_t)0) ++ dqstat->itime = CURRENT_TIME_SEC.tv_sec + ++ dqinfo->iexpire; ++ else if (CURRENT_TIME_SEC.tv_sec >= dqstat->itime) ++ return NO_QUOTA; ++ } ++ ++ return QUOTA_OK; ++} ++ ++static int vzquota_check_ugid_space(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid[], ++ int type, __u64 number, char prealloc) ++{ ++ struct dq_info *dqinfo; ++ struct dq_stat *dqstat; ++ ++ if (qugid[type] == NULL) ++ return QUOTA_OK; ++ if (qugid[type] == VZ_QUOTA_UGBAD) ++ return NO_QUOTA; ++ ++ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) ++ return QUOTA_OK; ++ if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) ++ return QUOTA_OK; ++ if (number == 0) ++ return QUOTA_OK; ++ ++ dqinfo = &qmblk->dq_ugid_info[type]; ++ dqstat = &qugid[type]->qugid_stat; ++ ++ if (dqstat->bhardlimit != 0 && ++ dqstat->bcurrent + number > dqstat->bhardlimit) ++ return NO_QUOTA; ++ ++ if (dqstat->bsoftlimit != 0 && ++ dqstat->bcurrent + number > dqstat->bsoftlimit) { ++ if (dqstat->btime == (time_t)0) { ++ if (!prealloc) ++ dqstat->btime = CURRENT_TIME_SEC.tv_sec ++ + dqinfo->bexpire; ++ else ++ /* ++ * Original Linux quota doesn't allow ++ * preallocation to exceed softlimit so ++ * exceeding will be always printed ++ */ ++ return NO_QUOTA; ++ } else if (CURRENT_TIME_SEC.tv_sec >= dqstat->btime) ++ return NO_QUOTA; ++ } ++ ++ return QUOTA_OK; ++} ++ ++/* ---------------------------------------------------------------------- ++ * Quota superblock operations ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * S_NOQUOTA note. ++ * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for ++ * - quota file (absent in our case) ++ * - after explicit DQUOT_DROP (earlier than clear_inode) in functions like ++ * filesystem-specific new_inode, before the inode gets outside links. 
++ * For the latter case, the only quota operation where care about S_NOQUOTA ++ * might be required is vzquota_drop, but there S_NOQUOTA has already been ++ * checked in DQUOT_DROP(). ++ * So, S_NOQUOTA may be ignored for now in the VZDQ code. ++ * ++ * The above note is not entirely correct. ++ * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from ++ * delete_inode if new_inode fails (for example, because of inode quota ++ * limits), so S_NOQUOTA check is needed in free_inode. ++ * This seems to be the dark corner of the current quota API. ++ */ ++ ++/* ++ * Initialize quota operations for the specified inode. ++ */ ++static int vzquota_initialize(struct inode *inode, int type) ++{ ++ vzquota_inode_init_call(inode); ++ return 0; /* ignored by caller */ ++} ++ ++/* ++ * Release quota for the specified inode. ++ */ ++static int vzquota_drop(struct inode *inode) ++{ ++ vzquota_inode_drop_call(inode); ++ return 0; /* ignored by caller */ ++} ++ ++/* ++ * Allocate block callback. ++ * ++ * If (prealloc) disk quota exceeding warning is not printed. ++ * See Linux quota to know why. ++ * ++ * Return: ++ * QUOTA_OK == 0 on SUCCESS ++ * NO_QUOTA == 1 if allocation should fail ++ */ ++static int vzquota_alloc_space(struct inode *inode, ++ qsize_t number, int prealloc) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ int ret = QUOTA_OK; ++ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid * qugid[MAXQUOTAS]; ++#endif ++ ++ /* checking first */ ++ ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat, ++ number, qmblk->dq_id, prealloc); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; ++ ret = vzquota_check_ugid_space(qmblk, qugid, ++ cnt, number, prealloc); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++ } ++ /* check ok, may increment */ ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ if (qugid[cnt] == NULL) ++ continue; ++ vzquota_incr_space(&qugid[cnt]->qugid_stat, number); ++ } ++#endif ++ vzquota_incr_space(&qmblk->dq_stat, number); ++ vzquota_data_unlock(inode, &data); ++ } ++ ++ inode_add_bytes(inode, number); ++ might_sleep(); ++ return QUOTA_OK; ++ ++no_quota: ++ vzquota_data_unlock(inode, &data); ++ return NO_QUOTA; ++} ++ ++/* ++ * Allocate inodes callback. 
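++ *
++ * Filesystems do not normally call this hook directly; they reach it
++ * through the generic quota wrappers. A rough sketch of the call path
++ * (the wrapper name comes from the stock 2.6 quotaops.h and is shown
++ * here only for illustration):
++ *
++ *	DQUOT_ALLOC_INODE(inode)
++ *	    -> inode->i_sb->dq_op->alloc_inode(inode, 1)
++ *	    -> vzquota_alloc_inode(inode, 1)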
++ * ++ * Return: ++ * QUOTA_OK == 0 on SUCCESS ++ * NO_QUOTA == 1 if allocation should fail ++ */ ++static int vzquota_alloc_inode(const struct inode *inode, unsigned long number) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ int ret = QUOTA_OK; ++ ++ qmblk = vzquota_inode_data((struct inode *)inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid *qugid[MAXQUOTAS]; ++#endif ++ ++ /* checking first */ ++ ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat, ++ number, qmblk->dq_id); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; ++ ret = vzquota_check_ugid_inodes(qmblk, qugid, ++ cnt, number); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++ } ++ /* check ok, may increment */ ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ if (qugid[cnt] == NULL) ++ continue; ++ vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number); ++ } ++#endif ++ vzquota_incr_inodes(&qmblk->dq_stat, number); ++ vzquota_data_unlock((struct inode *)inode, &data); ++ } ++ ++ might_sleep(); ++ return QUOTA_OK; ++ ++no_quota: ++ vzquota_data_unlock((struct inode *)inode, &data); ++ return NO_QUOTA; ++} ++ ++/* ++ * Free space callback. ++ */ ++static int vzquota_free_space(struct inode *inode, qsize_t number) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; /* isn't checked by the caller */ ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid * qugid; ++#endif ++ ++ vzquota_decr_space(&qmblk->dq_stat, number); ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid = INODE_QLNK(inode)->qugid[cnt]; ++ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) ++ continue; ++ vzquota_decr_space(&qugid->qugid_stat, number); ++ } ++#endif ++ vzquota_data_unlock(inode, &data); ++ } ++ inode_sub_bytes(inode, number); ++ might_sleep(); ++ return QUOTA_OK; ++} ++ ++/* ++ * Free inodes callback. 
++ */ ++static int vzquota_free_inode(const struct inode *inode, unsigned long number) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ if (IS_NOQUOTA(inode)) ++ return QUOTA_OK; ++ ++ qmblk = vzquota_inode_data((struct inode *)inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid * qugid; ++#endif ++ ++ vzquota_decr_inodes(&qmblk->dq_stat, number); ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid = INODE_QLNK(inode)->qugid[cnt]; ++ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) ++ continue; ++ vzquota_decr_inodes(&qugid->qugid_stat, number); ++ } ++#endif ++ vzquota_data_unlock((struct inode *)inode, &data); ++ } ++ might_sleep(); ++ return QUOTA_OK; ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++ ++/* ++ * helper function for quota_transfer ++ * check that we can add inode to this quota_id ++ */ ++static int vzquota_transfer_check(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid[], ++ unsigned int type, __u64 size) ++{ ++ if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK || ++ vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK) ++ return -1; ++ return 0; ++} ++ ++int vzquota_transfer_usage(struct inode *inode, ++ int mask, ++ struct vz_quota_ilink *qlnk) ++{ ++ struct vz_quota_ugid *qugid_old; ++ __u64 space; ++ int i; ++ ++ space = inode_get_bytes(inode); ++ for (i = 0; i < MAXQUOTAS; i++) { ++ if (!(mask & (1 << i))) ++ continue; ++ if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space)) ++ return -1; ++ } ++ ++ for (i = 0; i < MAXQUOTAS; i++) { ++ if (!(mask & (1 << i))) ++ continue; ++ qugid_old = INODE_QLNK(inode)->qugid[i]; ++ vzquota_decr_space(&qugid_old->qugid_stat, space); ++ vzquota_decr_inodes(&qugid_old->qugid_stat, 1); ++ vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space); ++ vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1); ++ } ++ return 0; ++} ++ ++/* ++ * Transfer the inode between diffent user/group quotas. ++ */ ++static int vzquota_transfer(struct inode *inode, struct iattr *iattr) ++{ ++ return vzquota_inode_transfer_call(inode, iattr) ? ++ NO_QUOTA : QUOTA_OK; ++} ++ ++#else /* CONFIG_VZ_QUOTA_UGID */ ++ ++static int vzquota_transfer(struct inode *inode, struct iattr *iattr) ++{ ++ return QUOTA_OK; ++} ++ ++#endif ++ ++/* ++ * Called under following semaphores: ++ * old_d->d_inode->i_sb->s_vfs_rename_sem ++ * old_d->d_inode->i_sem ++ * new_d->d_inode->i_sem ++ * [not verified --SAW] ++ */ ++static int vzquota_rename(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir) ++{ ++ return vzquota_rename_check(inode, old_dir, new_dir) ? ++ NO_QUOTA : QUOTA_OK; ++} ++ ++/* ++ * Structure of superblock diskquota operations. ++ */ ++struct dquot_operations vz_quota_operations = { ++ initialize: vzquota_initialize, ++ drop: vzquota_drop, ++ alloc_space: vzquota_alloc_space, ++ alloc_inode: vzquota_alloc_inode, ++ free_space: vzquota_free_space, ++ free_inode: vzquota_free_inode, ++ transfer: vzquota_transfer, ++ rename: vzquota_rename ++}; +diff -uprN linux-2.6.15.orig/fs/vzdq_tree.c linux-2.6.15-ve025stab014/fs/vzdq_tree.c +--- linux-2.6.15.orig/fs/vzdq_tree.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/vzdq_tree.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,286 @@ ++/* ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ * This file contains Virtuozzo quota tree implementation ++ */ ++ ++#include <linux/errno.h> ++#include <linux/slab.h> ++#include <linux/vzdq_tree.h> ++ ++struct quotatree_tree *quotatree_alloc(void) ++{ ++ int l; ++ struct quotatree_tree *tree; ++ ++ tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL); ++ if (tree == NULL) ++ goto out; ++ ++ for (l = 0; l < QUOTATREE_DEPTH; l++) { ++ INIT_LIST_HEAD(&tree->levels[l].usedlh); ++ INIT_LIST_HEAD(&tree->levels[l].freelh); ++ tree->levels[l].freenum = 0; ++ } ++ tree->root = NULL; ++ tree->leaf_num = 0; ++out: ++ return tree; ++} ++ ++static struct quotatree_node * ++quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level, ++ struct quotatree_find_state *st) ++{ ++ void **block; ++ struct quotatree_node *parent; ++ int l, index; ++ ++ parent = NULL; ++ block = (void **)&tree->root; ++ l = 0; ++ while (l < level && *block != NULL) { ++ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; ++ parent = *block; ++ block = parent->blocks + index; ++ l++; ++ } ++ if (st != NULL) { ++ st->block = block; ++ st->level = l; ++ } ++ ++ return parent; ++} ++ ++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st) ++{ ++ quotatree_follow(tree, id, QUOTATREE_DEPTH, st); ++ if (st->level == QUOTATREE_DEPTH) ++ return *st->block; ++ else ++ return NULL; ++} ++ ++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index) ++{ ++ int i, count; ++ struct quotatree_node *p; ++ void *leaf; ++ ++ if (QTREE_LEAFNUM(tree) <= index) ++ return NULL; ++ ++ count = 0; ++ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { ++ for (i = 0; i < QUOTATREE_BSIZE; i++) { ++ leaf = p->blocks[i]; ++ if (leaf == NULL) ++ continue; ++ if (count == index) ++ return leaf; ++ count++; ++ } ++ } ++ return NULL; ++} ++ ++/* returns data leaf (vz_quota_ugid) after _existent_ ugid (@id) ++ * in the tree... 
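++ * Together with quotatree_leaf_byindex() this allows walking all
++ * leaves in id order; an illustrative sketch only, where handle()
++ * stands for whatever per-leaf processing the caller needs:
++ *
++ *	struct vz_quota_ugid *ug;
++ *
++ *	for (ug = quotatree_leaf_byindex(tree, 0); ug != NULL;
++ *	     ug = quotatree_get_next(tree, ug->qugid_id))
++ *		handle(ug);
++ *
++ * The ugid quota code uses this pattern when it exports per-uid/gid
++ * statistics to user space.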
*/ ++void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id) ++{ ++ int off; ++ struct quotatree_node *parent, *p; ++ struct list_head *lh; ++ ++ /* get parent refering correct quota tree node of the last level */ ++ parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL); ++ if (!parent) ++ return NULL; ++ ++ off = (id & QUOTATREE_BMASK) + 1; /* next ugid */ ++ lh = &parent->list; ++ do { ++ p = list_entry(lh, struct quotatree_node, list); ++ for ( ; off < QUOTATREE_BSIZE; off++) ++ if (p->blocks[off]) ++ return p->blocks[off]; ++ off = 0; ++ lh = lh->next; ++ } while (lh != &QTREE_LEAFLVL(tree)->usedlh); ++ ++ return NULL; ++} ++ ++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st, void *data) ++{ ++ struct quotatree_node *p; ++ int l, index; ++ ++ while (st->level < QUOTATREE_DEPTH) { ++ l = st->level; ++ if (!list_empty(&tree->levels[l].freelh)) { ++ p = list_entry(tree->levels[l].freelh.next, ++ struct quotatree_node, list); ++ list_del(&p->list); ++ } else { ++ p = kmalloc(sizeof(struct quotatree_node), GFP_KERNEL); ++ if (p == NULL) ++ return -ENOMEM; ++ /* save block number in the l-level ++ * it uses for quota file generation */ ++ p->num = tree->levels[l].freenum++; ++ } ++ list_add(&p->list, &tree->levels[l].usedlh); ++ memset(p->blocks, 0, sizeof(p->blocks)); ++ *st->block = p; ++ ++ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; ++ st->block = p->blocks + index; ++ st->level++; ++ } ++ tree->leaf_num++; ++ *st->block = data; ++ ++ return 0; ++} ++ ++static struct quotatree_node * ++quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id, ++ int level) ++{ ++ struct quotatree_node *parent; ++ struct quotatree_find_state st; ++ ++ parent = quotatree_follow(tree, id, level, &st); ++ if (st.level == QUOTATREE_DEPTH) ++ tree->leaf_num--; ++ *st.block = NULL; ++ return parent; ++} ++ ++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id) ++{ ++ struct quotatree_node *p; ++ int level, i; ++ ++ p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH); ++ for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) { ++ for (i = 0; i < QUOTATREE_BSIZE; i++) ++ if (p->blocks[i] != NULL) ++ return; ++ list_move(&p->list, &tree->levels[level].freelh); ++ p = quotatree_remove_ptr(tree, id, level); ++ } ++} ++ ++#if 0 ++static void quotatree_walk(struct quotatree_tree *tree, ++ struct quotatree_node *node_start, ++ quotaid_t id_start, ++ int level_start, int level_end, ++ int (*callback)(struct quotatree_tree *, ++ quotaid_t id, ++ int level, ++ void *ptr, ++ void *data), ++ void *data) ++{ ++ struct quotatree_node *p; ++ int l, shift, index; ++ quotaid_t id; ++ struct quotatree_find_state st; ++ ++ p = node_start; ++ l = level_start; ++ shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; ++ id = id_start; ++ index = 0; ++ ++ /* ++ * Invariants: ++ * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; ++ * id & ((1 << shift) - 1) == 0 ++ * p is l-level node corresponding to id ++ */ ++ do { ++ if (!p) ++ break; ++ ++ if (l < level_end) { ++ for (; index < QUOTATREE_BSIZE; index++) ++ if (p->blocks[index] != NULL) ++ break; ++ if (index < QUOTATREE_BSIZE) { ++ /* descend */ ++ p = p->blocks[index]; ++ l++; ++ shift -= QUOTAID_BBITS; ++ id += (quotaid_t)index << shift; ++ index = 0; ++ continue; ++ } ++ } ++ ++ if ((*callback)(tree, id, l, p, data)) ++ break; ++ ++ /* ascend and to the next node */ ++ p = quotatree_follow(tree, id, l, &st); ++ ++ index = ((id >> shift) & QUOTATREE_BMASK) + 1; ++ l--; ++ 
shift += QUOTAID_BBITS; ++ id &= ~(((quotaid_t)1 << shift) - 1); ++ } while (l >= level_start); ++} ++#endif ++ ++static void free_list(struct list_head *node_list) ++{ ++ struct quotatree_node *p, *tmp; ++ ++ list_for_each_entry_safe(p, tmp, node_list, list) { ++ list_del(&p->list); ++ kfree(p); ++ } ++} ++ ++static inline void quotatree_free_nodes(struct quotatree_tree *tree) ++{ ++ int i; ++ ++ for (i = 0; i < QUOTATREE_DEPTH; i++) { ++ free_list(&tree->levels[i].usedlh); ++ free_list(&tree->levels[i].freelh); ++ } ++} ++ ++static void quotatree_free_leafs(struct quotatree_tree *tree, ++ void (*dtor)(void *)) ++{ ++ int i; ++ struct quotatree_node *p; ++ ++ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { ++ for (i = 0; i < QUOTATREE_BSIZE; i++) { ++ if (p->blocks[i] == NULL) ++ continue; ++ ++ dtor(p->blocks[i]); ++ } ++ } ++} ++ ++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)) ++{ ++ quotatree_free_leafs(tree, dtor); ++ quotatree_free_nodes(tree); ++ kfree(tree); ++} +diff -uprN linux-2.6.15.orig/fs/vzdq_ugid.c linux-2.6.15-ve025stab014/fs/vzdq_ugid.c +--- linux-2.6.15.orig/fs/vzdq_ugid.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/vzdq_ugid.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,1116 @@ ++/* ++ * Copyright (C) 2002 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains Virtuozzo UID/GID disk quota implementation ++ */ ++ ++#include <linux/config.h> ++#include <linux/string.h> ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/smp_lock.h> ++#include <linux/rcupdate.h> ++#include <asm/uaccess.h> ++#include <linux/proc_fs.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/quota.h> ++#include <linux/quotaio_v2.h> ++#include <linux/virtinfo.h> ++ ++#include <linux/vzctl.h> ++#include <linux/vzctl_quota.h> ++#include <linux/vzquota.h> ++ ++/* ++ * XXX ++ * may be something is needed for sb->s_dquot->info[]? ++ */ ++ ++#define USRQUOTA_MASK (1 << USRQUOTA) ++#define GRPQUOTA_MASK (1 << GRPQUOTA) ++#define QTYPE2MASK(type) (1 << (type)) ++ ++static kmem_cache_t *vz_quota_ugid_cachep; ++ ++/* guard to protect vz_quota_master from destroy in quota_on/off. 
Also protects ++ * list on the hash table */ ++extern struct semaphore vz_quota_sem; ++ ++inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid) ++{ ++ if (qugid != VZ_QUOTA_UGBAD) ++ atomic_inc(&qugid->qugid_count); ++ return qugid; ++} ++ ++/* we don't limit users with zero limits */ ++static inline int vzquota_fake_stat(struct dq_stat *stat) ++{ ++ return stat->bhardlimit == 0 && stat->bsoftlimit == 0 && ++ stat->ihardlimit == 0 && stat->isoftlimit == 0; ++} ++ ++/* callback function for quotatree_free() */ ++static inline void vzquota_free_qugid(void *ptr) ++{ ++ kmem_cache_free(vz_quota_ugid_cachep, ptr); ++} ++ ++/* ++ * destroy ugid, if it have zero refcount, limits and usage ++ * must be called under qmblk->dq_sem ++ */ ++void vzquota_put_ugid(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid) ++{ ++ if (qugid == VZ_QUOTA_UGBAD) ++ return; ++ qmblk_data_read_lock(qmblk); ++ if (atomic_dec_and_test(&qugid->qugid_count) && ++ (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 && ++ vzquota_fake_stat(&qugid->qugid_stat) && ++ qugid->qugid_stat.bcurrent == 0 && ++ qugid->qugid_stat.icurrent == 0) { ++ quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type), ++ qugid->qugid_id); ++ qmblk->dq_ugid_count--; ++ vzquota_free_qugid(qugid); ++ } ++ qmblk_data_read_unlock(qmblk); ++} ++ ++/* ++ * Get ugid block by its index, like it would present in array. ++ * In reality, this is not array - this is leafs chain of the tree. ++ * NULL if index is out of range. ++ * qmblk semaphore is required to protect the tree. ++ */ ++static inline struct vz_quota_ugid * ++vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type) ++{ ++ return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index); ++} ++ ++/* ++ * get next element from ugid "virtual array" ++ * ugid must be in current array and this array may not be changed between ++ * two accesses (quaranteed by "stopped" quota state and quota semaphore) ++ * qmblk semaphore is required to protect the tree ++ */ ++static inline struct vz_quota_ugid * ++vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid) ++{ ++ return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type), ++ qugid->qugid_id); ++} ++ ++/* ++ * requires dq_sem ++ */ ++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags) ++{ ++ struct vz_quota_ugid *qugid; ++ struct quotatree_tree *tree; ++ struct quotatree_find_state st; ++ ++ tree = QUGID_TREE(qmblk, type); ++ qugid = quotatree_find(tree, quota_id, &st); ++ if (qugid) ++ goto success; ++ ++ /* caller does not want alloc */ ++ if (flags & VZDQUG_FIND_DONT_ALLOC) ++ goto fail; ++ ++ if (flags & VZDQUG_FIND_FAKE) ++ goto doit; ++ ++ /* check limit */ ++ if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max) ++ goto fail; ++ ++ /* see comment at VZDQUG_FIXED_SET define */ ++ if (qmblk->dq_flags & VZDQUG_FIXED_SET) ++ goto fail; ++ ++doit: ++ /* alloc new structure */ ++ qugid = kmem_cache_alloc(vz_quota_ugid_cachep, ++ SLAB_NOFS | __GFP_NOFAIL); ++ if (qugid == NULL) ++ goto fail; ++ ++ /* initialize new structure */ ++ qugid->qugid_id = quota_id; ++ memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat)); ++ qugid->qugid_type = type; ++ atomic_set(&qugid->qugid_count, 0); ++ ++ /* insert in tree */ ++ if (quotatree_insert(tree, quota_id, &st, qugid) < 0) ++ goto fail_insert; ++ qmblk->dq_ugid_count++; ++ ++success: ++ vzquota_get_ugid(qugid); ++ return qugid; ++ ++fail_insert: ++ vzquota_free_qugid(qugid); 
++fail: ++ return VZ_QUOTA_UGBAD; ++} ++ ++/* ++ * takes dq_sem, may schedule ++ */ ++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags) ++{ ++ struct vz_quota_ugid *qugid; ++ ++ down(&qmblk->dq_sem); ++ qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags); ++ up(&qmblk->dq_sem); ++ ++ return qugid; ++} ++ ++/* ++ * destroy all ugid records on given quota master ++ */ ++void vzquota_kill_ugid(struct vz_quota_master *qmblk) ++{ ++ BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) || ++ (qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL)); ++ ++ if (qmblk->dq_uid_tree != NULL) { ++ quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid); ++ quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid); ++ } ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Management interface to ugid quota for (super)users. ++ * --------------------------------------------------------------------- */ ++ ++/** ++ * vzquota_find_qmblk - helper to emulate quota on virtual filesystems ++ * ++ * This function finds a quota master block corresponding to the root of ++ * a virtual filesystem. ++ * Returns a quota master block with reference taken, or %NULL if not under ++ * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation ++ * operations will fail). ++ * ++ * Note: this function uses vzquota_inode_qmblk(). ++ * The latter is a rather confusing function: it returns qmblk that used to be ++ * on the inode some time ago (without guarantee that it still has any ++ * relations to the inode). So, vzquota_find_qmblk() leaves it up to the ++ * caller to think whether the inode could have changed its qmblk and what to ++ * do in that case. 
++ * Currently, the callers appear to not care :( ++ */ ++struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb) ++{ ++ struct inode *qrinode; ++ struct vz_quota_master *qmblk; ++ ++ qmblk = NULL; ++ qrinode = NULL; ++ if (sb->s_op->get_quota_root != NULL) ++ qrinode = sb->s_op->get_quota_root(sb); ++ if (qrinode != NULL) ++ qmblk = vzquota_inode_qmblk(qrinode); ++ return qmblk; ++} ++ ++static int vzquota_initialize2(struct inode *inode, int type) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_drop2(struct inode *inode) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_alloc_space2(struct inode *inode, ++ qsize_t number, int prealloc) ++{ ++ inode_add_bytes(inode, number); ++ return QUOTA_OK; ++} ++ ++static int vzquota_alloc_inode2(const struct inode *inode, unsigned long number) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_free_space2(struct inode *inode, qsize_t number) ++{ ++ inode_sub_bytes(inode, number); ++ return QUOTA_OK; ++} ++ ++static int vzquota_free_inode2(const struct inode *inode, unsigned long number) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_transfer2(struct inode *inode, struct iattr *iattr) ++{ ++ return QUOTA_OK; ++} ++ ++struct dquot_operations vz_quota_operations2 = { ++ initialize: vzquota_initialize2, ++ drop: vzquota_drop2, ++ alloc_space: vzquota_alloc_space2, ++ alloc_inode: vzquota_alloc_inode2, ++ free_space: vzquota_free_space2, ++ free_inode: vzquota_free_inode2, ++ transfer: vzquota_transfer2 ++}; ++ ++static int vz_quota_on(struct super_block *sb, int type, ++ int format_id, char *path) ++{ ++ struct vz_quota_master *qmblk; ++ int mask, mask2; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ mask = 0; ++ mask2 = 0; ++ sb->dq_op = &vz_quota_operations2; ++ sb->s_qcop = &vz_quotactl_operations; ++ if (type == USRQUOTA) { ++ mask = DQUOT_USR_ENABLED; ++ mask2 = VZDQ_USRQUOTA; ++ } ++ if (type == GRPQUOTA) { ++ mask = DQUOT_GRP_ENABLED; ++ mask2 = VZDQ_GRPQUOTA; ++ } ++ err = -EBUSY; ++ if (qmblk->dq_flags & mask2) ++ goto out; ++ ++ err = 0; ++ qmblk->dq_flags |= mask2; ++ sb->s_dquot.flags |= mask; ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++static int vz_quota_off(struct super_block *sb, int type) ++{ ++ struct vz_quota_master *qmblk; ++ int mask2; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ mask2 = 0; ++ if (type == USRQUOTA) ++ mask2 = VZDQ_USRQUOTA; ++ if (type == GRPQUOTA) ++ mask2 = VZDQ_GRPQUOTA; ++ err = -EINVAL; ++ if (!(qmblk->dq_flags & mask2)) ++ goto out; ++ ++ qmblk->dq_flags &= ~mask2; ++ err = 0; ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++static int vz_quota_sync(struct super_block *sb, int type) ++{ ++ return 0; /* vz quota is always uptodate */ ++} ++ ++static int vz_get_dqblk(struct super_block *sb, int type, ++ qid_t id, struct if_dqblk *di) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid *ugid; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ err = 0; ++ ugid = vzquota_find_ugid(qmblk, id, type, 
VZDQUG_FIND_DONT_ALLOC); ++ if (ugid != VZ_QUOTA_UGBAD) { ++ qmblk_data_read_lock(qmblk); ++ di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10; ++ di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10; ++ di->dqb_curspace = ugid->qugid_stat.bcurrent; ++ di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit; ++ di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit; ++ di->dqb_curinodes = ugid->qugid_stat.icurrent; ++ di->dqb_btime = ugid->qugid_stat.btime; ++ di->dqb_itime = ugid->qugid_stat.itime; ++ qmblk_data_read_unlock(qmblk); ++ di->dqb_valid = QIF_ALL; ++ vzquota_put_ugid(qmblk, ugid); ++ } else { ++ memset(di, 0, sizeof(*di)); ++ di->dqb_valid = QIF_ALL; ++ } ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++/* must be called under vz_quota_sem */ ++static int __vz_set_dqblk(struct vz_quota_master *qmblk, ++ int type, qid_t id, struct if_dqblk *di) ++{ ++ struct vz_quota_ugid *ugid; ++ ++ ugid = vzquota_find_ugid(qmblk, id, type, 0); ++ if (ugid == VZ_QUOTA_UGBAD) ++ return -ESRCH; ++ ++ qmblk_data_write_lock(qmblk); ++ /* ++ * Subtle compatibility breakage. ++ * ++ * Some old non-vz kernel quota didn't start grace period ++ * if the new soft limit happens to be below the usage. ++ * Non-vz kernel quota in 2.4.20 starts the grace period ++ * (if it hasn't been started). ++ * Current non-vz kernel performs even more complicated ++ * manipulations... ++ * ++ * Also, current non-vz kernels have inconsistency related to ++ * the grace time start. In regular operations the grace period ++ * is started if the usage is greater than the soft limit (and, ++ * strangely, is cancelled if the usage is less). ++ * However, set_dqblk starts the grace period if the usage is greater ++ * or equal to the soft limit. ++ * ++ * Here we try to mimic the behavior of the current non-vz kernel. 
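++ *
++ * For the block limits set below this boils down to roughly:
++ *
++ *	if (new bsoftlimit == 0 || usage < new bsoftlimit)
++ *		clear the grace period (btime = 0);
++ *	else if (the caller did not pass QIF_BTIME)
++ *		start it now (btime = now + bexpire);
++ *	else
++ *		take the caller's btime as is;
++ *
++ * and the inode limits are treated the same way.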
++ */ ++ if (di->dqb_valid & QIF_BLIMITS) { ++ ugid->qugid_stat.bhardlimit = ++ (__u64)di->dqb_bhardlimit << 10; ++ ugid->qugid_stat.bsoftlimit = ++ (__u64)di->dqb_bsoftlimit << 10; ++ if (di->dqb_bsoftlimit == 0 || ++ ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit) ++ ugid->qugid_stat.btime = 0; ++ else if (!(di->dqb_valid & QIF_BTIME)) ++ ugid->qugid_stat.btime = CURRENT_TIME_SEC.tv_sec ++ + qmblk->dq_ugid_info[type].bexpire; ++ else ++ ugid->qugid_stat.btime = di->dqb_btime; ++ } ++ if (di->dqb_valid & QIF_ILIMITS) { ++ ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit; ++ ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit; ++ if (di->dqb_isoftlimit == 0 || ++ ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit) ++ ugid->qugid_stat.itime = 0; ++ else if (!(di->dqb_valid & QIF_ITIME)) ++ ugid->qugid_stat.itime = CURRENT_TIME_SEC.tv_sec ++ + qmblk->dq_ugid_info[type].iexpire; ++ else ++ ugid->qugid_stat.itime = di->dqb_itime; ++ } ++ qmblk_data_write_unlock(qmblk); ++ vzquota_put_ugid(qmblk, ugid); ++ ++ return 0; ++} ++ ++static int vz_set_dqblk(struct super_block *sb, int type, ++ qid_t id, struct if_dqblk *di) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ err = __vz_set_dqblk(qmblk, type, id, di); ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++static int vz_get_dqinfo(struct super_block *sb, int type, ++ struct if_dqinfo *ii) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ err = 0; ++ ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire; ++ ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire; ++ ii->dqi_flags = 0; ++ ii->dqi_valid = IIF_ALL; ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++/* must be called under vz_quota_sem */ ++static int __vz_set_dqinfo(struct vz_quota_master *qmblk, ++ int type, struct if_dqinfo *ii) ++{ ++ if (ii->dqi_valid & IIF_FLAGS) ++ if (ii->dqi_flags & DQF_MASK) ++ return -EINVAL; ++ ++ if (ii->dqi_valid & IIF_BGRACE) ++ qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace; ++ if (ii->dqi_valid & IIF_IGRACE) ++ qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace; ++ return 0; ++} ++ ++static int vz_set_dqinfo(struct super_block *sb, int type, ++ struct if_dqinfo *ii) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ err = __vz_set_dqinfo(qmblk, type, ii); ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++#ifdef CONFIG_QUOTA_COMPAT ++ ++#define Q_GETQUOTI_SIZE 1024 ++ ++#define UGID2DQBLK(dst, src) \ ++ do { \ ++ (dst).dqb_ihardlimit = (src)->qugid_stat.ihardlimit; \ ++ (dst).dqb_isoftlimit = (src)->qugid_stat.isoftlimit; \ ++ (dst).dqb_curinodes = (src)->qugid_stat.icurrent; \ ++ /* in 1K blocks */ \ ++ (dst).dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \ ++ /* in 1K blocks */ \ ++ (dst).dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \ ++ /* in bytes, 64 bit */ \ ++ (dst).dqb_curspace = 
(src)->qugid_stat.bcurrent; \ ++ (dst).dqb_btime = (src)->qugid_stat.btime; \ ++ (dst).dqb_itime = (src)->qugid_stat.itime; \ ++ } while (0) ++ ++static int vz_get_quoti(struct super_block *sb, int type, qid_t idx, ++ struct v2_disk_dqblk *dqblk) ++{ ++ struct vz_quota_master *qmblk; ++ struct v2_disk_dqblk data; ++ struct vz_quota_ugid *ugid; ++ int count; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ down(&qmblk->dq_sem); ++ for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0; ++ ugid != NULL && count < Q_GETQUOTI_SIZE; ++ count++) ++ { ++ qmblk_data_read_lock(qmblk); ++ UGID2DQBLK(data, ugid); ++ qmblk_data_read_unlock(qmblk); ++ data.dqb_id = ugid->qugid_id; ++ if (copy_to_user(dqblk, &data, sizeof(data))) ++ goto fault; ++ dqblk++; ++ ++ /* Find next entry */ ++ ugid = vzquota_get_next(qmblk, ugid); ++ BUG_ON(ugid != NULL && ugid->qugid_type != type); ++ } ++ err = count; ++out_ugid: ++ up(&qmblk->dq_sem); ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ ++ return err; ++ ++fault: ++ err = count ? count : -EFAULT; ++ goto out_ugid; ++} ++ ++#endif ++ ++struct quotactl_ops vz_quotactl_operations = { ++ quota_on: vz_quota_on, ++ quota_off: vz_quota_off, ++ quota_sync: vz_quota_sync, ++ get_info: vz_get_dqinfo, ++ set_info: vz_set_dqinfo, ++ get_dqblk: vz_get_dqblk, ++ set_dqblk: vz_set_dqblk, ++#ifdef CONFIG_QUOTA_COMPAT ++ get_quoti: vz_get_quoti ++#endif ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * Management interface for host system admins. ++ * --------------------------------------------------------------------- */ ++ ++static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size, ++ struct vz_quota_iface *u_ugid_buf) ++{ ++ struct vz_quota_master *qmblk; ++ int ret; ++ ++ down(&vz_quota_sem); ++ ++ ret = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ ret = -EBUSY; ++ if (qmblk->dq_state != VZDQ_STARTING) ++ goto out; /* working quota doesn't accept new ugids */ ++ ++ ret = 0; ++ /* start to add ugids */ ++ for (ret = 0; ret < ugid_size; ret++) { ++ struct vz_quota_iface ugid_buf; ++ struct vz_quota_ugid *ugid; ++ ++ if (copy_from_user(&ugid_buf, u_ugid_buf, sizeof(ugid_buf))) ++ break; ++ ++ if (ugid_buf.qi_type >= MAXQUOTAS) ++ break; /* bad quota type - this is the only check */ ++ ++ ugid = vzquota_find_ugid(qmblk, ++ ugid_buf.qi_id, ugid_buf.qi_type, 0); ++ if (ugid == VZ_QUOTA_UGBAD) { ++ qmblk->dq_flags |= VZDQUG_FIXED_SET; ++ break; /* limit reached */ ++ } ++ ++ /* update usage/limits ++ * we can copy the data without the lock, because the data ++ * cannot be modified in VZDQ_STARTING state */ ++ ugid->qugid_stat = ugid_buf.qi_stat; ++ ++ vzquota_put_ugid(qmblk, ugid); ++ ++ u_ugid_buf++; /* next user buffer */ ++ } ++out: ++ up(&vz_quota_sem); ++ ++ return ret; ++} ++ ++static int quota_ugid_setgrace(unsigned int quota_id, ++ struct dq_info u_dq_info[]) ++{ ++ struct vz_quota_master *qmblk; ++ struct dq_info dq_info[MAXQUOTAS]; ++ struct dq_info *target; ++ int err, type; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EBUSY; ++ if (qmblk->dq_state != VZDQ_STARTING) ++ goto out; /* working quota doesn't accept changing options */ ++ ++ err = -EFAULT; ++ if 
(copy_from_user(dq_info, u_dq_info, sizeof(dq_info))) ++ goto out; ++ ++ err = 0; ++ ++ /* update in qmblk */ ++ for (type = 0; type < MAXQUOTAS; type ++) { ++ target = &qmblk->dq_ugid_info[type]; ++ target->bexpire = dq_info[type].bexpire; ++ target->iexpire = dq_info[type].iexpire; ++ } ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size, ++ struct vz_quota_iface *u_ugid_buf) ++{ ++ int type, count; ++ struct vz_quota_ugid *ugid; ++ ++ if (QTREE_LEAFNUM(qmblk->dq_uid_tree) + ++ QTREE_LEAFNUM(qmblk->dq_gid_tree) ++ <= index) ++ return 0; ++ ++ count = 0; ++ ++ type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? USRQUOTA : GRPQUOTA; ++ if (type == GRPQUOTA) ++ index -= QTREE_LEAFNUM(qmblk->dq_uid_tree); ++ ++ /* loop through ugid and then qgid quota */ ++repeat: ++ for (ugid = vzquota_get_byindex(qmblk, index, type); ++ ugid != NULL && count < size; ++ ugid = vzquota_get_next(qmblk, ugid), count++) ++ { ++ struct vz_quota_iface ugid_buf; ++ ++ /* form interface buffer and send in to user-level */ ++ qmblk_data_read_lock(qmblk); ++ memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat, ++ sizeof(ugid_buf.qi_stat)); ++ qmblk_data_read_unlock(qmblk); ++ ugid_buf.qi_id = ugid->qugid_id; ++ ugid_buf.qi_type = ugid->qugid_type; ++ ++ if (copy_to_user(u_ugid_buf, &ugid_buf, sizeof(ugid_buf))) ++ goto fault; ++ u_ugid_buf++; /* next portion of user buffer */ ++ } ++ ++ if (type == USRQUOTA && count < size) { ++ type = GRPQUOTA; ++ index = 0; ++ goto repeat; ++ } ++ ++ return count; ++ ++fault: ++ return count ? count : -EFAULT; ++} ++ ++static int quota_ugid_getstat(unsigned int quota_id, ++ int index, int size, struct vz_quota_iface *u_ugid_buf) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ if (index < 0 || size < 0) ++ return -EINVAL; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ down(&qmblk->dq_sem); ++ err = do_quota_ugid_getstat(qmblk, index, size, u_ugid_buf); ++ up(&qmblk->dq_sem); ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++static int quota_ugid_getgrace(unsigned int quota_id, ++ struct dq_info u_dq_info[]) ++{ ++ struct vz_quota_master *qmblk; ++ struct dq_info dq_info[MAXQUOTAS]; ++ struct dq_info *target; ++ int err, type; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = 0; ++ /* update from qmblk */ ++ for (type = 0; type < MAXQUOTAS; type ++) { ++ target = &qmblk->dq_ugid_info[type]; ++ dq_info[type].bexpire = target->bexpire; ++ dq_info[type].iexpire = target->iexpire; ++ dq_info[type].flags = target->flags; ++ } ++ ++ if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info))) ++ err = -EFAULT; ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_getconfig(unsigned int quota_id, ++ struct vz_quota_ugid_stat *info) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_stat kinfo; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = 0; ++ kinfo.limit = qmblk->dq_ugid_max; ++ kinfo.count = qmblk->dq_ugid_count; ++ kinfo.flags = qmblk->dq_flags; ++ ++ if (copy_to_user(info, &kinfo, sizeof(kinfo))) ++ err = -EFAULT; ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_setconfig(unsigned int quota_id, ++ struct vz_quota_ugid_stat *info) ++{ ++ struct vz_quota_master *qmblk; 
++ struct vz_quota_ugid_stat kinfo; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&kinfo, info, sizeof(kinfo))) ++ goto out; ++ ++ err = 0; ++ qmblk->dq_ugid_max = kinfo.limit; ++ if (qmblk->dq_state == VZDQ_STARTING) { ++ qmblk->dq_flags = kinfo.flags; ++ if (qmblk->dq_flags & VZDQUG_ON) ++ qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA; ++ } ++ ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_setlimit(unsigned int quota_id, ++ struct vz_quota_ugid_setlimit *u_lim) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_setlimit lim; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ESRCH; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&lim, u_lim, sizeof(lim))) ++ goto out; ++ ++ err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb); ++ ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_setinfo(unsigned int quota_id, ++ struct vz_quota_ugid_setinfo *u_info) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_setinfo info; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ESRCH; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&info, u_info, sizeof(info))) ++ goto out; ++ ++ err = __vz_set_dqinfo(qmblk, info.type, &info.dqi); ++ ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++/* ++ * This is a system call to maintain UGID quotas ++ * Note this call is allowed to run ONLY from VE0 ++ */ ++long do_vzquotaugidctl(struct vzctl_quotaugidctl *qub) ++{ ++ int ret; ++ ++ ret = -EPERM; ++ /* access allowed only from root of VE0 */ ++ if (!capable(CAP_SYS_RESOURCE) || ++ !capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ switch (qub->cmd) { ++ case VZ_DQ_UGID_GETSTAT: ++ ret = quota_ugid_getstat(qub->quota_id, ++ qub->ugid_index, qub->ugid_size, ++ (struct vz_quota_iface *)qub->addr); ++ break; ++ case VZ_DQ_UGID_ADDSTAT: ++ ret = quota_ugid_addstat(qub->quota_id, qub->ugid_size, ++ (struct vz_quota_iface *)qub->addr); ++ break; ++ case VZ_DQ_UGID_GETGRACE: ++ ret = quota_ugid_getgrace(qub->quota_id, ++ (struct dq_info *)qub->addr); ++ break; ++ case VZ_DQ_UGID_SETGRACE: ++ ret = quota_ugid_setgrace(qub->quota_id, ++ (struct dq_info *)qub->addr); ++ break; ++ case VZ_DQ_UGID_GETCONFIG: ++ ret = quota_ugid_getconfig(qub->quota_id, ++ (struct vz_quota_ugid_stat *)qub->addr); ++ break; ++ case VZ_DQ_UGID_SETCONFIG: ++ ret = quota_ugid_setconfig(qub->quota_id, ++ (struct vz_quota_ugid_stat *)qub->addr); ++ break; ++ case VZ_DQ_UGID_SETLIMIT: ++ ret = quota_ugid_setlimit(qub->quota_id, ++ (struct vz_quota_ugid_setlimit *) ++ qub->addr); ++ break; ++ case VZ_DQ_UGID_SETINFO: ++ ret = quota_ugid_setinfo(qub->quota_id, ++ (struct vz_quota_ugid_setinfo *) ++ qub->addr); ++ break; ++ default: ++ ret = -EINVAL; ++ goto out; ++ } ++out: ++ return ret; ++} ++ ++static void ugid_quota_on_sb(struct super_block *sb) ++{ ++ struct super_block *real_sb; ++ struct vz_quota_master *qmblk; ++ ++ if (!sb->s_op->get_quota_root) ++ return; ++ ++ real_sb = sb->s_op->get_quota_root(sb)->i_sb; ++ if (real_sb->dq_op != &vz_quota_operations) ++ return; ++ ++ sb->dq_op = &vz_quota_operations2; ++ sb->s_qcop = &vz_quotactl_operations; ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ 
sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD)) ++ return; ++ down(&vz_quota_sem); ++ if (qmblk->dq_flags & VZDQ_USRQUOTA) ++ sb->s_dquot.flags |= DQUOT_USR_ENABLED; ++ if (qmblk->dq_flags & VZDQ_GRPQUOTA) ++ sb->s_dquot.flags |= DQUOT_GRP_ENABLED; ++ up(&vz_quota_sem); ++ qmblk_put(qmblk); ++} ++ ++static void ugid_quota_off_sb(struct super_block *sb) ++{ ++ /* can't make quota off on mounted super block */ ++ BUG_ON(sb->s_root != NULL); ++} ++ ++static int ugid_notifier_call(struct vnotifier_block *self, ++ unsigned long n, void *data, int old_ret) ++{ ++ struct virt_info_quota *viq; ++ ++ viq = (struct virt_info_quota *)data; ++ ++ switch (n) { ++ case VIRTINFO_QUOTA_ON: ++ ugid_quota_on_sb(viq->super); ++ break; ++ case VIRTINFO_QUOTA_OFF: ++ ugid_quota_off_sb(viq->super); ++ break; ++ case VIRTINFO_QUOTA_GETSTAT: ++ break; ++ default: ++ return old_ret; ++ } ++ return NOTIFY_OK; ++} ++ ++static struct vnotifier_block ugid_notifier_block = { ++ .notifier_call = ugid_notifier_call, ++}; ++ ++/* ---------------------------------------------------------------------- ++ * Init/exit. ++ * --------------------------------------------------------------------- */ ++ ++struct quota_format_type vz_quota_empty_v2_format = { ++ qf_fmt_id: QFMT_VFS_V0, ++ qf_ops: NULL, ++ qf_owner: THIS_MODULE ++}; ++ ++int vzquota_ugid_init() ++{ ++ int err; ++ ++ vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid", ++ sizeof(struct vz_quota_ugid), ++ 0, SLAB_HWCACHE_ALIGN, ++ NULL, NULL); ++ if (vz_quota_ugid_cachep == NULL) ++ goto err_slab; ++ ++ err = register_quota_format(&vz_quota_empty_v2_format); ++ if (err) ++ goto err_reg; ++ ++ virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block); ++ return 0; ++ ++err_reg: ++ kmem_cache_destroy(vz_quota_ugid_cachep); ++ return err; ++ ++err_slab: ++ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); ++ return -ENOMEM; ++} ++ ++void vzquota_ugid_release() ++{ ++ virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block); ++ unregister_quota_format(&vz_quota_empty_v2_format); ++ ++ if (kmem_cache_destroy(vz_quota_ugid_cachep)) ++ printk(KERN_ERR "VZQUOTA: kmem_cache_destroy failed\n"); ++} +diff -uprN linux-2.6.15.orig/fs/vzdquot.c linux-2.6.15-ve025stab014/fs/vzdquot.c +--- linux-2.6.15.orig/fs/vzdquot.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/fs/vzdquot.c 2006-01-27 14:48:09.000000000 +0300 +@@ -0,0 +1,1705 @@ ++/* ++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains the core of Virtuozzo disk quota implementation: ++ * maintenance of VZDQ information in inodes, ++ * external interfaces, ++ * module entry. 
++ */ ++ ++#include <linux/config.h> ++#include <linux/kernel.h> ++#include <linux/string.h> ++#include <linux/list.h> ++#include <asm/atomic.h> ++#include <linux/spinlock.h> ++#include <asm/semaphore.h> ++#include <linux/slab.h> ++#include <linux/fs.h> ++#include <linux/dcache.h> ++#include <linux/quota.h> ++#include <linux/rcupdate.h> ++#include <linux/module.h> ++#include <asm/uaccess.h> ++#include <linux/vzctl.h> ++#include <linux/vzctl_quota.h> ++#include <linux/vzquota.h> ++#include <linux/virtinfo.h> ++#include <linux/vzdq_tree.h> ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Locking ++ * ++ * ---------------------------------------------------------------------- */ ++ ++/* ++ * Serializes on/off and all other do_vzquotactl operations. ++ * Protects qmblk hash. ++ */ ++struct semaphore vz_quota_sem; ++ ++/* ++ * Data access locks ++ * inode_qmblk ++ * protects qmblk pointers in all inodes and qlnk content in general ++ * (but not qmblk content); ++ * also protects related qmblk invalidation procedures; ++ * can't be per-inode because of vzquota_dtree_qmblk complications ++ * and problems with serialization with quota_on, ++ * but can be per-superblock; ++ * qmblk_data ++ * protects qmblk fields (such as current usage) ++ * quota_data ++ * protects charge/uncharge operations, thus, implies ++ * qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock ++ * (to protect ugid pointers). ++ * ++ * Lock order: ++ * inode_qmblk_lock -> dcache_lock ++ * inode_qmblk_lock -> qmblk_data ++ */ ++static spinlock_t vzdq_qmblk_lock = SPIN_LOCK_UNLOCKED; ++ ++inline void inode_qmblk_lock(struct super_block *sb) ++{ ++ spin_lock(&vzdq_qmblk_lock); ++} ++ ++inline void inode_qmblk_unlock(struct super_block *sb) ++{ ++ spin_unlock(&vzdq_qmblk_lock); ++} ++ ++inline void qmblk_data_read_lock(struct vz_quota_master *qmblk) ++{ ++ spin_lock(&qmblk->dq_data_lock); ++} ++ ++inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk) ++{ ++ spin_unlock(&qmblk->dq_data_lock); ++} ++ ++inline void qmblk_data_write_lock(struct vz_quota_master *qmblk) ++{ ++ spin_lock(&qmblk->dq_data_lock); ++} ++ ++inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk) ++{ ++ spin_unlock(&qmblk->dq_data_lock); ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Master hash table handling. ++ * ++ * SMP not safe, serialied by vz_quota_sem within quota syscalls ++ * ++ * --------------------------------------------------------------------- */ ++ ++static kmem_cache_t *vzquota_cachep; ++ ++/* ++ * Hash function. 
++ */ ++#define QHASH_BITS 6 ++#define VZ_QUOTA_HASH_SIZE (1 << QHASH_BITS) ++#define QHASH_MASK (VZ_QUOTA_HASH_SIZE - 1) ++ ++struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE]; ++int vzquota_hash_size = VZ_QUOTA_HASH_SIZE; ++ ++static inline int vzquota_hash_func(unsigned int qid) ++{ ++ return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK); ++} ++ ++/** ++ * vzquota_alloc_master - alloc and instantiate master quota record ++ * ++ * Returns: ++ * pointer to newly created record if SUCCESS ++ * -ENOMEM if out of memory ++ * -EEXIST if record with given quota_id already exist ++ */ ++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, ++ struct vz_quota_stat *qstat) ++{ ++ int err; ++ struct vz_quota_master *qmblk; ++ ++ err = -EEXIST; ++ if (vzquota_find_master(quota_id) != NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL); ++ if (qmblk == NULL) ++ goto out; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ qmblk->dq_uid_tree = quotatree_alloc(); ++ if (!qmblk->dq_uid_tree) ++ goto out_free; ++ ++ qmblk->dq_gid_tree = quotatree_alloc(); ++ if (!qmblk->dq_gid_tree) ++ goto out_free_tree; ++#endif ++ ++ qmblk->dq_state = VZDQ_STARTING; ++ init_MUTEX(&qmblk->dq_sem); ++ spin_lock_init(&qmblk->dq_data_lock); ++ ++ qmblk->dq_id = quota_id; ++ qmblk->dq_stat = qstat->dq_stat; ++ qmblk->dq_info = qstat->dq_info; ++ qmblk->dq_root_dentry = NULL; ++ qmblk->dq_root_mnt = NULL; ++ qmblk->dq_sb = NULL; ++ qmblk->dq_ugid_count = 0; ++ qmblk->dq_ugid_max = 0; ++ qmblk->dq_flags = 0; ++ memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info)); ++ INIT_LIST_HEAD(&qmblk->dq_ilink_list); ++ ++ atomic_set(&qmblk->dq_count, 1); ++ ++ /* insert in hash chain */ ++ list_add(&qmblk->dq_hash, ++ &vzquota_hash_table[vzquota_hash_func(quota_id)]); ++ ++ /* success */ ++ return qmblk; ++ ++out_free_tree: ++ quotatree_free(qmblk->dq_uid_tree, NULL); ++out_free: ++ kmem_cache_free(vzquota_cachep, qmblk); ++out: ++ return ERR_PTR(err); ++} ++ ++static struct vz_quota_master *vzquota_alloc_fake(void) ++{ ++ struct vz_quota_master *qmblk; ++ ++ qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL); ++ if (qmblk == NULL) ++ return NULL; ++ memset(qmblk, 0, sizeof(*qmblk)); ++ qmblk->dq_state = VZDQ_STOPING; ++ qmblk->dq_flags = VZDQ_NOQUOT; ++ spin_lock_init(&qmblk->dq_data_lock); ++ INIT_LIST_HEAD(&qmblk->dq_ilink_list); ++ atomic_set(&qmblk->dq_count, 1); ++ return qmblk; ++} ++ ++/** ++ * vzquota_find_master - find master record with given id ++ * ++ * Returns qmblk without touching its refcounter. ++ * Called under vz_quota_sem. ++ */ ++struct vz_quota_master *vzquota_find_master(unsigned int quota_id) ++{ ++ int i; ++ struct vz_quota_master *qp; ++ ++ i = vzquota_hash_func(quota_id); ++ list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) { ++ if (qp->dq_id == quota_id) ++ return qp; ++ } ++ return NULL; ++} ++ ++/** ++ * vzquota_free_master - release resources taken by qmblk, freeing memory ++ * ++ * qmblk is assumed to be already taken out from the hash. ++ * Should be called outside vz_quota_sem. 
++ */ ++void vzquota_free_master(struct vz_quota_master *qmblk) ++{ ++#ifdef CONFIG_VZ_QUOTA_UGID ++ vzquota_kill_ugid(qmblk); ++#endif ++ BUG_ON(!list_empty(&qmblk->dq_ilink_list)); ++ kmem_cache_free(vzquota_cachep, qmblk); ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Passing quota information through current ++ * ++ * Used in inode -> qmblk lookup at inode creation stage (since at that ++ * time there are no links between the inode being created and its parent ++ * directory). ++ * ++ * --------------------------------------------------------------------- */ ++ ++#define VZDQ_CUR_MAGIC 0x57d0fee2 ++ ++static inline int vzquota_cur_qmblk_check(void) ++{ ++ return current->magic == VZDQ_CUR_MAGIC; ++} ++ ++static inline struct inode *vzquota_cur_qmblk_fetch(void) ++{ ++ return current->ino; ++} ++ ++static inline void vzquota_cur_qmblk_set(struct inode *data) ++{ ++ struct task_struct *tsk; ++ ++ tsk = current; ++ tsk->magic = VZDQ_CUR_MAGIC; ++ tsk->ino = data; ++} ++ ++#if 0 ++static inline void vzquota_cur_qmblk_reset(void) ++{ ++ current->magic = 0; ++} ++#endif ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Superblock quota operations ++ * ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * Kernel structure abuse. ++ * We use files[0] pointer as an int variable: ++ * reference counter of how many quota blocks uses this superblock. ++ * files[1] is used for generations structure which helps us to track ++ * when traversing of dentries is really required. ++ */ ++#define __VZ_QUOTA_NOQUOTA(sb) sb->s_dquot.vzdq_master ++#define __VZ_QUOTA_TSTAMP(sb) ((struct timeval *)\ ++ &sb->s_dquot.dqio_sem) ++ ++#if defined(VZ_QUOTA_UNLOAD) ++ ++#define __VZ_QUOTA_SBREF(sb) sb->s_dquot.vzdq_count ++ ++struct dquot_operations *orig_dq_op; ++struct quotactl_ops *orig_dq_cop; ++ ++/** ++ * quota_get_super - account for new a quoted tree under the superblock ++ * ++ * One superblock can have multiple directory subtrees with different VZ ++ * quotas. We keep a counter of such subtrees and set VZ quota operations or ++ * reset the default ones. ++ * ++ * Called under vz_quota_sem (from quota_on). ++ */ ++int vzquota_get_super(struct super_block *sb) ++{ ++ if (sb->dq_op != &vz_quota_operations) { ++ down(&sb->s_dquot.dqonoff_sem); ++ if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) { ++ up(&sb->s_dquot.dqonoff_sem); ++ return -EEXIST; ++ } ++ if (orig_dq_op == NULL && sb->dq_op != NULL) ++ orig_dq_op = sb->dq_op; ++ sb->dq_op = &vz_quota_operations; ++ if (orig_dq_cop == NULL && sb->s_qcop != NULL) ++ orig_dq_cop = sb->s_qcop; ++ /* XXX this may race with sys_quotactl */ ++#ifdef CONFIG_VZ_QUOTA_UGID ++ sb->s_qcop = &vz_quotactl_operations; ++#else ++ sb->s_qcop = NULL; ++#endif ++ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); ++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); ++ ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ /* ++ * To get quotaops.h call us we need to mark superblock ++ * as having quota. These flags mark the moment when ++ * our dq_op start to be called. ++ * ++ * The ordering of dq_op and s_dquot.flags assignment ++ * needs to be enforced, but other CPUs do not do rmb() ++ * between s_dquot.flags and dq_op accesses. 
++ */ ++ wmb(); synchronize_sched(); ++ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; ++ __module_get(THIS_MODULE); ++ up(&sb->s_dquot.dqonoff_sem); ++ } ++ /* protected by vz_quota_sem */ ++ __VZ_QUOTA_SBREF(sb)++; ++ return 0; ++} ++ ++/** ++ * quota_put_super - release superblock when one quota tree goes away ++ * ++ * Called under vz_quota_sem. ++ */ ++void vzquota_put_super(struct super_block *sb) ++{ ++ int count; ++ ++ count = --__VZ_QUOTA_SBREF(sb); ++ if (count == 0) { ++ down(&sb->s_dquot.dqonoff_sem); ++ sb->s_dquot.flags = 0; ++ wmb(); synchronize_sched(); ++ sema_init(&sb->s_dquot.dqio_sem, 1); ++ sb->s_qcop = orig_dq_cop; ++ sb->dq_op = orig_dq_op; ++ inode_qmblk_lock(sb); ++ quota_gen_put(SB_QGEN(sb)); ++ SB_QGEN(sb) = NULL; ++ /* release qlnk's without qmblk */ ++ remove_inode_quota_links_list(&non_vzquota_inodes_lh, ++ sb, NULL); ++ /* ++ * Races with quota initialization: ++ * after this inode_qmblk_unlock all inode's generations are ++ * invalidated, quota_inode_qmblk checks superblock operations. ++ */ ++ inode_qmblk_unlock(sb); ++ /* ++ * Module refcounting: in theory, this is the best place ++ * to call module_put(THIS_MODULE). ++ * In reality, it can't be done because we can't be sure that ++ * other CPUs do not enter our code segment through dq_op ++ * cached long time ago. Quotaops interface isn't supposed to ++ * go into modules currently (that is, into unloadable ++ * modules). By omitting module_put, our module isn't ++ * unloadable. ++ */ ++ up(&sb->s_dquot.dqonoff_sem); ++ } ++} ++ ++#else ++ ++struct vzquota_new_sop { ++ struct super_operations new_op; ++ struct super_operations *old_op; ++}; ++ ++/** ++ * vzquota_shutdown_super - callback on umount ++ */ ++void vzquota_shutdown_super(struct super_block *sb) ++{ ++ struct vz_quota_master *qmblk; ++ struct vzquota_new_sop *sop; ++ ++ qmblk = __VZ_QUOTA_NOQUOTA(sb); ++ __VZ_QUOTA_NOQUOTA(sb) = NULL; ++ if (qmblk != NULL) ++ qmblk_put(qmblk); ++ sop = container_of(sb->s_op, struct vzquota_new_sop, new_op); ++ sb->s_op = sop->old_op; ++ kfree(sop); ++ (*sb->s_op->put_super)(sb); ++} ++ ++/** ++ * vzquota_get_super - account for new a quoted tree under the superblock ++ * ++ * One superblock can have multiple directory subtrees with different VZ ++ * quotas. ++ * ++ * Called under vz_quota_sem (from vzquota_on). ++ */ ++int vzquota_get_super(struct super_block *sb) ++{ ++ struct vz_quota_master *qnew; ++ struct vzquota_new_sop *sop; ++ int err; ++ ++ down(&sb->s_dquot.dqonoff_sem); ++ err = -EEXIST; ++ if ((sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) && ++ sb->dq_op != &vz_quota_operations) ++ goto out_up; ++ ++ /* ++ * This allocation code should be under sb->dq_op check below, but ++ * it doesn't really matter... 
++ */ ++ if (__VZ_QUOTA_NOQUOTA(sb) == NULL) { ++ qnew = vzquota_alloc_fake(); ++ if (qnew == NULL) ++ goto out_up; ++ __VZ_QUOTA_NOQUOTA(sb) = qnew; ++ } ++ ++ if (sb->dq_op != &vz_quota_operations) { ++ sop = kmalloc(sizeof(*sop), GFP_KERNEL); ++ if (sop == NULL) { ++ vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb)); ++ __VZ_QUOTA_NOQUOTA(sb) = NULL; ++ goto out_up; ++ } ++ memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op)); ++ sop->new_op.put_super = &vzquota_shutdown_super; ++ sop->old_op = sb->s_op; ++ sb->s_op = &sop->new_op; ++ ++ sb->dq_op = &vz_quota_operations; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ sb->s_qcop = &vz_quotactl_operations; ++#else ++ sb->s_qcop = NULL; ++#endif ++ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); ++ ++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); ++ /* these 2 list heads are checked in sync_dquots() */ ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ sb->s_dquot.info[USRQUOTA].dqi_format = ++ &vz_quota_empty_v2_format; ++ sb->s_dquot.info[GRPQUOTA].dqi_format = ++ &vz_quota_empty_v2_format; ++ ++ /* ++ * To get quotaops.h to call us we need to mark superblock ++ * as having quota. These flags mark the moment when ++ * our dq_op start to be called. ++ * ++ * The ordering of dq_op and s_dquot.flags assignment ++ * needs to be enforced, but other CPUs do not do rmb() ++ * between s_dquot.flags and dq_op accesses. ++ */ ++ wmb(); synchronize_sched(); ++ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; ++ } ++ err = 0; ++ ++out_up: ++ up(&sb->s_dquot.dqonoff_sem); ++ return err; ++} ++ ++/** ++ * vzquota_put_super - one quota tree less on this superblock ++ * ++ * Called under vz_quota_sem. ++ */ ++void vzquota_put_super(struct super_block *sb) ++{ ++ /* ++ * Even if this put is the last one, ++ * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop ++ * won't be called and the remaining qmblk references won't be put. 
++ */ ++} ++ ++#endif ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Helpers for inode -> qmblk link maintenance ++ * ++ * --------------------------------------------------------------------- */ ++ ++#define __VZ_QUOTA_EMPTY ((void *)0xbdbdbdbd) ++#define VZ_QUOTA_IS_NOQUOTA(qm, sb) ((qm)->dq_flags & VZDQ_NOQUOT) ++#define VZ_QUOTA_EMPTY_IOPS (&vfs_empty_iops) ++extern struct inode_operations vfs_empty_iops; ++ ++static int VZ_QUOTA_IS_ACTUAL(struct inode *inode) ++{ ++ struct vz_quota_master *qmblk; ++ ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk == VZ_QUOTA_BAD) ++ return 1; ++ if (qmblk == __VZ_QUOTA_EMPTY) ++ return 0; ++ if (qmblk->dq_flags & VZDQ_NOACT) ++ /* not actual (invalidated) qmblk */ ++ return 0; ++ return 1; ++} ++ ++static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk) ++{ ++ return qlnk->qmblk == __VZ_QUOTA_EMPTY; ++} ++ ++static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk) ++{ ++ qlnk->qmblk = __VZ_QUOTA_EMPTY; ++ qlnk->origin = VZ_QUOTAO_SETE; ++} ++ ++void vzquota_qlnk_init(struct vz_quota_ilink *qlnk) ++{ ++ memset(qlnk, 0, sizeof(*qlnk)); ++ INIT_LIST_HEAD(&qlnk->list); ++ vzquota_qlnk_set_empty(qlnk); ++ qlnk->origin = VZ_QUOTAO_INIT; ++} ++ ++void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk) ++{ ++ might_sleep(); ++ if (vzquota_qlnk_is_empty(qlnk)) ++ return; ++#if defined(CONFIG_VZ_QUOTA_UGID) ++ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) { ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid *quid, *qgid; ++ qmblk = qlnk->qmblk; ++ quid = qlnk->qugid[USRQUOTA]; ++ qgid = qlnk->qugid[GRPQUOTA]; ++ if (quid != NULL || qgid != NULL) { ++ down(&qmblk->dq_sem); ++ if (qgid != NULL) ++ vzquota_put_ugid(qmblk, qgid); ++ if (quid != NULL) ++ vzquota_put_ugid(qmblk, quid); ++ up(&qmblk->dq_sem); ++ } ++ } ++#endif ++ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qlnk->qmblk); ++ qlnk->origin = VZ_QUOTAO_DESTR; ++} ++ ++/** ++ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents ++ * @qlt: temporary ++ * @qli: inode's ++ * ++ * Locking is provided by the caller (depending on the context). ++ * After swap, @qli is inserted into the corresponding dq_ilink_list, ++ * @qlt list is reinitialized. ++ */ ++static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt, ++ struct vz_quota_ilink *qli) ++{ ++ struct vz_quota_master *qb; ++ struct vz_quota_ugid *qu; ++ int i; ++ ++ qb = qlt->qmblk; ++ qlt->qmblk = qli->qmblk; ++ qli->qmblk = qb; ++ list_del_init(&qli->list); ++ if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD) ++ list_add(&qli->list, &qb->dq_ilink_list); ++ INIT_LIST_HEAD(&qlt->list); ++ qli->origin = VZ_QUOTAO_SWAP; ++ ++ for (i = 0; i < MAXQUOTAS; i++) { ++ qu = qlt->qugid[i]; ++ qlt->qugid[i] = qli->qugid[i]; ++ qli->qugid[i] = qu; ++ } ++} ++ ++/** ++ * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks ++ * ++ * Called under dcache_lock and inode_qmblk locks. ++ * Returns 1 if locks were dropped inside, 0 if atomic. 
++ */ ++static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk, ++ struct inode *inode) ++{ ++ if (vzquota_qlnk_is_empty(qlnk)) ++ return 0; ++ if (qlnk->qmblk == VZ_QUOTA_BAD) { ++ vzquota_qlnk_set_empty(qlnk); ++ return 0; ++ } ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(qlnk); ++ vzquota_qlnk_init(qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ spin_lock(&dcache_lock); ++ return 1; ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++/** ++ * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content ++ * ++ * Similar to vzquota_qlnk_reinit_locked, called under different locks. ++ */ ++static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk, ++ struct inode *inode, ++ struct vz_quota_master *qmblk) ++{ ++ if (vzquota_qlnk_is_empty(qlnk)) ++ return 0; ++ /* may be optimized if qlnk->qugid all NULLs */ ++ qmblk_data_write_unlock(qmblk); ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(qlnk); ++ vzquota_qlnk_init(qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ qmblk_data_write_lock(qmblk); ++ return 1; ++} ++#endif ++ ++/** ++ * vzquota_qlnk_fill - fill vz_quota_ilink content ++ * @qlnk: vz_quota_ilink to fill ++ * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid) ++ * @qmblk: qmblk to which this @qlnk will belong ++ * ++ * Called under dcache_lock and inode_qmblk locks. ++ * Returns 1 if locks were dropped inside, 0 if atomic. ++ * @qlnk is expected to be empty. ++ */ ++static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk, ++ struct inode *inode, ++ struct vz_quota_master *qmblk) ++{ ++ if (qmblk != VZ_QUOTA_BAD) ++ qmblk_get(qmblk); ++ qlnk->qmblk = qmblk; ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++ if (qmblk != VZ_QUOTA_BAD && ++ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && ++ (qmblk->dq_flags & VZDQUG_ON)) { ++ struct vz_quota_ugid *quid, *qgid; ++ ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(inode->i_sb); ++ ++ down(&qmblk->dq_sem); ++ quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0); ++ qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0); ++ up(&qmblk->dq_sem); ++ ++ inode_qmblk_lock(inode->i_sb); ++ spin_lock(&dcache_lock); ++ qlnk->qugid[USRQUOTA] = quid; ++ qlnk->qugid[GRPQUOTA] = qgid; ++ return 1; ++ } ++#endif ++ ++ return 0; ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++/** ++ * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid ++ * ++ * This function is a helper for vzquota_transfer, and differs from ++ * vzquota_qlnk_fill only by locking. 
++ */ ++static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk, ++ struct inode *inode, ++ struct iattr *iattr, ++ int mask, ++ struct vz_quota_master *qmblk) ++{ ++ qmblk_get(qmblk); ++ qlnk->qmblk = qmblk; ++ ++ if (mask) { ++ struct vz_quota_ugid *quid, *qgid; ++ ++ quid = qgid = NULL; /* to make gcc happy */ ++ if (!(mask & (1 << USRQUOTA))) ++ quid = vzquota_get_ugid(INODE_QLNK(inode)-> ++ qugid[USRQUOTA]); ++ if (!(mask & (1 << GRPQUOTA))) ++ qgid = vzquota_get_ugid(INODE_QLNK(inode)-> ++ qugid[GRPQUOTA]); ++ ++ qmblk_data_write_unlock(qmblk); ++ inode_qmblk_unlock(inode->i_sb); ++ ++ down(&qmblk->dq_sem); ++ if (mask & (1 << USRQUOTA)) ++ quid = __vzquota_find_ugid(qmblk, iattr->ia_uid, ++ USRQUOTA, 0); ++ if (mask & (1 << GRPQUOTA)) ++ qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid, ++ GRPQUOTA, 0); ++ up(&qmblk->dq_sem); ++ ++ inode_qmblk_lock(inode->i_sb); ++ qmblk_data_write_lock(qmblk); ++ qlnk->qugid[USRQUOTA] = quid; ++ qlnk->qugid[GRPQUOTA] = qgid; ++ return 1; ++ } ++ ++ return 0; ++} ++#endif ++ ++/** ++ * __vzquota_inode_init - make sure inode's qlnk is initialized ++ * ++ * May be called if qlnk is already initialized, detects this situation itself. ++ * Called under inode_qmblk_lock. ++ */ ++static void __vzquota_inode_init(struct inode *inode, unsigned char origin) ++{ ++ if (inode->i_dquot[USRQUOTA] == NODQUOT) { ++ vzquota_qlnk_init(INODE_QLNK(inode)); ++ inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NODQUOT; ++ } ++ INODE_QLNK(inode)->origin = origin; ++} ++ ++/** ++ * vzquota_inode_drop - destroy VZ quota information in the inode ++ * ++ * Inode must not be externally accessible or dirty. ++ */ ++static void vzquota_inode_drop(struct inode *inode) ++{ ++ struct vz_quota_ilink qlnk; ++ ++ vzquota_qlnk_init(&qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode)); ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_DRCAL; ++ inode->i_dquot[USRQUOTA] = NODQUOT; ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&qlnk); ++} ++ ++/** ++ * vzquota_inode_qmblk_set - initialize inode's qlnk ++ * @inode: inode to be initialized ++ * @qmblk: quota master block to which this inode should belong (may be BAD) ++ * @qlnk: placeholder to store data to resolve locking issues ++ * ++ * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise. ++ * Called under dcache_lock and inode_qmblk locks. ++ * @qlnk will be destroyed in the caller chain. ++ * ++ * It is not mandatory to restart parent checks since quota on/off currently ++ * shrinks dentry tree and checks that there are not outside references. ++ * But if at some time that shink is removed, restarts will be required. ++ * Additionally, the restarts prevent inconsistencies if the dentry tree ++ * changes (inode is moved). This is not a big deal, but anyway... 
++ */ ++static int vzquota_inode_qmblk_set(struct inode *inode, ++ struct vz_quota_master *qmblk, ++ struct vz_quota_ilink *qlnk) ++{ ++ if (qmblk == NULL) { ++ printk(KERN_ERR "VZDQ: NULL in set, " ++ "orig %u, dev %s, inode %lu, fs %s\n", ++ INODE_QLNK(inode)->origin, ++ inode->i_sb->s_id, inode->i_ino, ++ inode->i_sb->s_type->name); ++ printk(KERN_ERR "current %d (%s), VE %d\n", ++ current->pid, current->comm, ++ VEID(get_exec_env())); ++ dump_stack(); ++ qmblk = VZ_QUOTA_BAD; ++ } ++ while (1) { ++ if (vzquota_qlnk_is_empty(qlnk) && ++ vzquota_qlnk_fill(qlnk, inode, qmblk)) ++ return 1; ++ if (qlnk->qmblk == qmblk) ++ break; ++ if (vzquota_qlnk_reinit_locked(qlnk, inode)) ++ return 1; ++ } ++ vzquota_qlnk_swap(qlnk, INODE_QLNK(inode)); ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_QSET; ++ return 0; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * vzquota_inode_qmblk (inode -> qmblk lookup) parts ++ * ++ * --------------------------------------------------------------------- */ ++ ++static int vzquota_dparents_check_attach(struct inode *inode) ++{ ++ if (!list_empty(&inode->i_dentry)) ++ return 0; ++ printk(KERN_ERR "VZDQ: no parent for " ++ "dev %s, inode %lu, fs %s\n", ++ inode->i_sb->s_id, ++ inode->i_ino, ++ inode->i_sb->s_type->name); ++ return -1; ++} ++ ++static struct inode *vzquota_dparents_check_actual(struct inode *inode) ++{ ++ struct dentry *de; ++ ++ list_for_each_entry(de, &inode->i_dentry, d_alias) { ++ if (de->d_parent == de) /* detached dentry, perhaps */ ++ continue; ++ /* first access to parent, make sure its qlnk initialized */ ++ __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT); ++ if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode)) ++ return de->d_parent->d_inode; ++ } ++ return NULL; ++} ++ ++static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode) ++{ ++ struct dentry *de; ++ struct vz_quota_master *qmblk; ++ ++ qmblk = NULL; ++ list_for_each_entry(de, &inode->i_dentry, d_alias) { ++ if (de->d_parent == de) /* detached dentry, perhaps */ ++ continue; ++ if (qmblk == NULL) { ++ qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk; ++ continue; ++ } ++ if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) { ++ printk(KERN_WARNING "VZDQ: multiple quotas for " ++ "dev %s, inode %lu, fs %s\n", ++ inode->i_sb->s_id, ++ inode->i_ino, ++ inode->i_sb->s_type->name); ++ qmblk = VZ_QUOTA_BAD; ++ break; ++ } ++ } ++ if (qmblk == NULL) { ++ printk(KERN_WARNING "VZDQ: not attached to tree, " ++ "dev %s, inode %lu, fs %s\n", ++ inode->i_sb->s_id, ++ inode->i_ino, ++ inode->i_sb->s_type->name); ++ qmblk = VZ_QUOTA_BAD; ++ } ++ return qmblk; ++} ++ ++static void vzquota_dbranch_actualize(struct inode *inode, ++ struct inode *refinode) ++{ ++ struct inode *pinode; ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ilink qlnk; ++ ++ vzquota_qlnk_init(&qlnk); ++ ++start: ++ if (inode == inode->i_sb->s_root->d_inode) { ++ /* filesystem root */ ++ atomic_inc(&inode->i_count); ++ do { ++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); ++ } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk)); ++ goto out; ++ } ++ ++ if (!vzquota_dparents_check_attach(inode)) { ++ pinode = vzquota_dparents_check_actual(inode); ++ if (pinode != NULL) { ++ inode = pinode; ++ goto start; ++ } ++ } ++ ++ atomic_inc(&inode->i_count); ++ while (1) { ++ if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */ ++ break; ++ /* ++ * Need to check parents again if we have slept inside ++ * vzquota_inode_qmblk_set() in the loop. 
++ * If the state of parents is different, just return and repeat ++ * the actualizing process again from the inode passed to ++ * vzquota_inode_qmblk_recalc(). ++ */ ++ if (!vzquota_dparents_check_attach(inode)) { ++ if (vzquota_dparents_check_actual(inode) != NULL) ++ break; ++ qmblk = vzquota_dparents_check_same(inode); ++ } else ++ qmblk = VZ_QUOTA_BAD; ++ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */ ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_ACT; ++ break; ++ } ++ } ++ ++out: ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(refinode->i_sb); ++ vzquota_qlnk_destroy(&qlnk); ++ iput(inode); ++ inode_qmblk_lock(refinode->i_sb); ++ spin_lock(&dcache_lock); ++} ++ ++static void vzquota_dtree_qmblk_recalc(struct inode *inode, ++ struct vz_quota_ilink *qlnk) ++{ ++ struct inode *pinode; ++ struct vz_quota_master *qmblk; ++ ++ if (inode == inode->i_sb->s_root->d_inode) { ++ /* filesystem root */ ++ do { ++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); ++ } while (vzquota_inode_qmblk_set(inode, qmblk, qlnk)); ++ return; ++ } ++ ++start: ++ if (VZ_QUOTA_IS_ACTUAL(inode)) ++ return; ++ /* ++ * Here qmblk is (re-)initialized for all ancestors. ++ * This is not a very efficient procedure, but it guarantees that ++ * the quota tree is consistent (that is, the inode doesn't have two ++ * ancestors with different qmblk). ++ */ ++ if (!vzquota_dparents_check_attach(inode)) { ++ pinode = vzquota_dparents_check_actual(inode); ++ if (pinode != NULL) { ++ vzquota_dbranch_actualize(pinode, inode); ++ goto start; ++ } ++ qmblk = vzquota_dparents_check_same(inode); ++ } else ++ qmblk = VZ_QUOTA_BAD; ++ ++ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) ++ goto start; ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_DTREE; ++} ++ ++static void vzquota_det_qmblk_recalc(struct inode *inode, ++ struct vz_quota_ilink *qlnk) ++{ ++ struct inode *parent; ++ struct vz_quota_master *qmblk; ++ char *msg; ++ int cnt; ++ time_t timeout; ++ ++ cnt = 0; ++ parent = NULL; ++start: ++ /* ++ * qmblk of detached inodes shouldn't be considered as not actual. ++ * They are not in any dentry tree, so quota on/off shouldn't affect ++ * them. 
++ */ ++ if (!vzquota_qlnk_is_empty(INODE_QLNK(inode))) ++ return; ++ ++ timeout = 3; ++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); ++ msg = "detached inode not in creation"; ++ if (inode->i_op != VZ_QUOTA_EMPTY_IOPS) ++ goto fail; ++ qmblk = VZ_QUOTA_BAD; ++ msg = "unexpected creation context"; ++ if (!vzquota_cur_qmblk_check()) ++ goto fail; ++ timeout = 0; ++ parent = vzquota_cur_qmblk_fetch(); ++ msg = "uninitialized parent"; ++ if (vzquota_qlnk_is_empty(INODE_QLNK(parent))) ++ goto fail; ++ msg = "parent not in tree"; ++ if (list_empty(&parent->i_dentry)) ++ goto fail; ++ msg = "parent has 0 refcount"; ++ if (!atomic_read(&parent->i_count)) ++ goto fail; ++ msg = "parent has different sb"; ++ if (parent->i_sb != inode->i_sb) ++ goto fail; ++ if (!VZ_QUOTA_IS_ACTUAL(parent)) { ++ vzquota_dbranch_actualize(parent, inode); ++ goto start; ++ } ++ ++ qmblk = INODE_QLNK(parent)->qmblk; ++set: ++ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) ++ goto start; ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_DET; ++ return; ++ ++fail: ++ { ++ struct timeval tv, tvo; ++ do_gettimeofday(&tv); ++ memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo)); ++ tv.tv_sec -= tvo.tv_sec; ++ if (tv.tv_usec < tvo.tv_usec) { ++ tv.tv_sec--; ++ tv.tv_usec += USEC_PER_SEC - tvo.tv_usec; ++ } else ++ tv.tv_usec -= tvo.tv_usec; ++ if (tv.tv_sec < timeout) ++ goto set; ++ printk(KERN_ERR "VZDQ: %s, orig %u," ++ " dev %s, inode %lu, fs %s\n", ++ msg, INODE_QLNK(inode)->origin, ++ inode->i_sb->s_id, inode->i_ino, ++ inode->i_sb->s_type->name); ++ if (!cnt++) { ++ printk(KERN_ERR "current %d (%s), VE %d," ++ " time %ld.%06ld\n", ++ current->pid, current->comm, ++ VEID(get_exec_env()), ++ tv.tv_sec, tv.tv_usec); ++ dump_stack(); ++ } ++ if (parent != NULL) ++ printk(KERN_ERR "VZDQ: parent of %lu is %lu\n", ++ inode->i_ino, parent->i_ino); ++ } ++ goto set; ++} ++ ++static void vzquota_inode_qmblk_recalc(struct inode *inode, ++ struct vz_quota_ilink *qlnk) ++{ ++ spin_lock(&dcache_lock); ++ if (!list_empty(&inode->i_dentry)) ++ vzquota_dtree_qmblk_recalc(inode, qlnk); ++ else ++ vzquota_det_qmblk_recalc(inode, qlnk); ++ spin_unlock(&dcache_lock); ++} ++ ++/** ++ * vzquota_inode_qmblk - obtain inode's qmblk ++ * ++ * Returns qmblk with refcounter taken, %NULL if not under ++ * VZ quota or %VZ_QUOTA_BAD. ++ * ++ * FIXME: This function should be removed when vzquota_find_qmblk / ++ * get_quota_root / vzquota_dstat code is cleaned up. 
++ */ ++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ilink qlnk; ++ ++ might_sleep(); ++ ++ if (inode->i_sb->dq_op != &vz_quota_operations) ++ return NULL; ++#if defined(VZ_QUOTA_UNLOAD) ++#error Make sure qmblk does not disappear ++#endif ++ ++ vzquota_qlnk_init(&qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ ++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || ++ !VZ_QUOTA_IS_ACTUAL(inode)) ++ vzquota_inode_qmblk_recalc(inode, &qlnk); ++ ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk != VZ_QUOTA_BAD) { ++ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) ++ qmblk_get(qmblk); ++ else ++ qmblk = NULL; ++ } ++ ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&qlnk); ++ return qmblk; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Calls from quota operations ++ * ++ * --------------------------------------------------------------------- */ ++ ++/** ++ * vzquota_inode_init_call - call from DQUOT_INIT ++ */ ++void vzquota_inode_init_call(struct inode *inode) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ /* initializes inode's quota inside */ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ vzquota_data_unlock(inode, &data); ++ ++ /* ++ * The check is needed for repeated new_inode() calls from a single ++ * ext3 call like create or mkdir in case of -ENOSPC. ++ */ ++ spin_lock(&dcache_lock); ++ if (!list_empty(&inode->i_dentry)) ++ vzquota_cur_qmblk_set(inode); ++ spin_unlock(&dcache_lock); ++} ++ ++/** ++ * vzquota_inode_drop_call - call from DQUOT_DROP ++ */ ++void vzquota_inode_drop_call(struct inode *inode) ++{ ++ vzquota_inode_drop(inode); ++} ++ ++/** ++ * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs ++ * @inode: the inode ++ * @data: storage space ++ * ++ * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk. ++ * On return if qmblk is neither NULL nor VZ_QUOTA_BAD: ++ * qmblk in inode's qlnk is the same as returned, ++ * ugid pointers inside inode's qlnk are valid, ++ * some locks are taken (and should be released by vzquota_data_unlock). ++ * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken. ++ */ ++struct vz_quota_master *vzquota_inode_data(struct inode *inode, ++ struct vz_quota_datast *data) ++{ ++ struct vz_quota_master *qmblk; ++ ++ might_sleep(); ++ ++ vzquota_qlnk_init(&data->qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ ++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || ++ !VZ_QUOTA_IS_ACTUAL(inode)) ++ vzquota_inode_qmblk_recalc(inode, &data->qlnk); ++ ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk != VZ_QUOTA_BAD) { ++ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) { ++ /* ++ * Note that in the current implementation, ++ * inode_qmblk_lock can theoretically be dropped here. ++ * This place is serialized with quota_off because ++ * quota_off fails when there are extra dentry ++ * references and syncs inodes before removing quota ++ * information from them. ++ * However, quota usage information should stop being ++ * updated immediately after vzquota_off. 
++ */ ++ qmblk_data_write_lock(qmblk); ++ } else { ++ inode_qmblk_unlock(inode->i_sb); ++ qmblk = NULL; ++ } ++ } else { ++ inode_qmblk_unlock(inode->i_sb); ++ } ++ return qmblk; ++} ++ ++void vzquota_data_unlock(struct inode *inode, ++ struct vz_quota_datast *data) ++{ ++ qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk); ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&data->qlnk); ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++/** ++ * vzquota_inode_transfer_call - call from vzquota_transfer ++ */ ++int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ struct vz_quota_ilink qlnew; ++ int mask; ++ int ret; ++ ++ might_sleep(); ++ vzquota_qlnk_init(&qlnew); ++start: ++ qmblk = vzquota_inode_data(inode, &data); ++ ret = NO_QUOTA; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out_destr; ++ ret = QUOTA_OK; ++ if (qmblk == NULL) ++ goto out_destr; ++ qmblk_get(qmblk); ++ ++ ret = QUOTA_OK; ++ if (!(qmblk->dq_flags & VZDQUG_ON)) ++ /* no ugid quotas */ ++ goto out_unlock; ++ ++ mask = 0; ++ if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid) ++ mask |= 1 << USRQUOTA; ++ if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid) ++ mask |= 1 << GRPQUOTA; ++ while (1) { ++ if (vzquota_qlnk_is_empty(&qlnew) && ++ vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk)) ++ break; ++ if (qlnew.qmblk == INODE_QLNK(inode)->qmblk && ++ qlnew.qmblk == qmblk) ++ goto finish; ++ if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk)) ++ break; ++ } ++ ++ /* prepare for restart */ ++ vzquota_data_unlock(inode, &data); ++ qmblk_put(qmblk); ++ goto start; ++ ++finish: ++ /* all references obtained successfully */ ++ ret = vzquota_transfer_usage(inode, mask, &qlnew); ++ if (!ret) { ++ vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode)); ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_TRANS; ++ } ++out_unlock: ++ vzquota_data_unlock(inode, &data); ++ qmblk_put(qmblk); ++out_destr: ++ vzquota_qlnk_destroy(&qlnew); ++ return ret; ++} ++#endif ++ ++int vzquota_rename_check(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ilink qlnk1, qlnk2; ++ int c, ret; ++ ++ if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb) ++ return -1; ++ ++ might_sleep(); ++ ++ vzquota_qlnk_init(&qlnk1); ++ vzquota_qlnk_init(&qlnk2); ++ inode_qmblk_lock(inode->i_sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ __vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL); ++ __vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL); ++ ++ do { ++ c = 0; ++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || ++ !VZ_QUOTA_IS_ACTUAL(inode)) { ++ vzquota_inode_qmblk_recalc(inode, &qlnk1); ++ c++; ++ } ++ if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) || ++ !VZ_QUOTA_IS_ACTUAL(new_dir)) { ++ vzquota_inode_qmblk_recalc(new_dir, &qlnk2); ++ c++; ++ } ++ } while (c); ++ ++ ret = 0; ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk != INODE_QLNK(new_dir)->qmblk) { ++ ret = -1; ++ if (qmblk != VZ_QUOTA_BAD && ++ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && ++ qmblk->dq_root_dentry->d_inode == inode && ++ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk, ++ inode->i_sb) && ++ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk, ++ inode->i_sb)) ++ /* quota root rename is allowed */ ++ ret = 0; ++ } ++ ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&qlnk2); ++ vzquota_qlnk_destroy(&qlnk1); ++ return ret; ++} ++ ++ ++/* 
---------------------------------------------------------------------- ++ * ++ * qmblk-related parts of on/off operations ++ * ++ * --------------------------------------------------------------------- */ ++ ++/** ++ * vzquota_check_dtree - check dentry tree if quota on/off is allowed ++ * ++ * This function doesn't allow quota to be turned on/off if some dentries in ++ * the tree have external references. ++ * In addition to technical reasons, it enforces user-space correctness: ++ * current usage (taken from or reported to the user space) can be meaningful ++ * and accurate only if the tree is not being modified. ++ * Side effect: additional vfsmount structures referencing the tree (bind ++ * mounts of tree nodes to some other places) are not allowed at on/off time. ++ */ ++int vzquota_check_dtree(struct vz_quota_master *qmblk, int off) ++{ ++ struct dentry *dentry; ++ int err, count; ++ ++ err = -EBUSY; ++ dentry = qmblk->dq_root_dentry; ++ ++ if (d_unhashed(dentry)) ++ goto unhashed; ++ ++ /* attempt to shrink */ ++ if (!list_empty(&dentry->d_subdirs)) { ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(dentry->d_sb); ++ shrink_dcache_parent(dentry); ++ inode_qmblk_lock(dentry->d_sb); ++ spin_lock(&dcache_lock); ++ if (!list_empty(&dentry->d_subdirs)) ++ goto out; ++ ++ count = 1; ++ if (dentry == dentry->d_sb->s_root) ++ count += 2; /* sb and mnt refs */ ++ if (atomic_read(&dentry->d_count) < count) { ++ printk(KERN_ERR "%s: too small count %d vs %d.\n", ++ __FUNCTION__, ++ atomic_read(&dentry->d_count), count); ++ goto out; ++ } ++ if (atomic_read(&dentry->d_count) > count) ++ goto out; ++ } ++ ++ err = 0; ++out: ++ return err; ++ ++unhashed: ++ /* ++ * Quota root is removed. ++ * Allow to turn quota off, but not on. ++ */ ++ if (off) ++ err = 0; ++ goto out; ++} ++ ++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, ++ struct vz_quota_master *qmblk) ++{ ++ struct vz_quota_ilink qlnk; ++ struct vz_quota_master *qold, *qnew; ++ int err; ++ ++ might_sleep(); ++ ++ qold = NULL; ++ qnew = vzquota_alloc_fake(); ++ if (qnew == NULL) ++ return -ENOMEM; ++ ++ vzquota_qlnk_init(&qlnk); ++ inode_qmblk_lock(sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ ++ spin_lock(&dcache_lock); ++ while (1) { ++ err = vzquota_check_dtree(qmblk, 0); ++ if (err) ++ break; ++ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)) ++ break; ++ } ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_ON; ++ spin_unlock(&dcache_lock); ++ ++ if (!err) { ++ qold = __VZ_QUOTA_NOQUOTA(sb); ++ qold->dq_flags |= VZDQ_NOACT; ++ __VZ_QUOTA_NOQUOTA(sb) = qnew; ++ } ++ ++ inode_qmblk_unlock(sb); ++ vzquota_qlnk_destroy(&qlnk); ++ if (qold != NULL) ++ qmblk_put(qold); ++ ++ return err; ++} ++ ++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk) ++{ ++ int ret; ++ ++ ret = 0; ++ inode_qmblk_lock(sb); ++ ++ spin_lock(&dcache_lock); ++ if (vzquota_check_dtree(qmblk, 1)) ++ ret = -EBUSY; ++ spin_unlock(&dcache_lock); ++ ++ if (!ret) ++ qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT; ++ inode_qmblk_unlock(sb); ++ return ret; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * External interfaces ++ * ++ * ---------------------------------------------------------------------*/ ++ ++static int vzquota_ioctl(struct inode *ino, struct file *file, ++ unsigned int cmd, unsigned long arg) ++{ ++ int err; ++ struct vzctl_quotactl qb; ++ struct vzctl_quotaugidctl qub; ++ ++ switch (cmd) { ++ case VZCTL_QUOTA_CTL: ++ err = -ENOTTY; ++ break; ++ case 
VZCTL_QUOTA_NEW_CTL: ++ err = -EFAULT; ++ if (copy_from_user(&qb, (void *)arg, sizeof(qb))) ++ break; ++ err = do_vzquotactl(qb.cmd, qb.quota_id, ++ qb.qstat, qb.ve_root); ++ break; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ case VZCTL_QUOTA_UGID_CTL: ++ err = -EFAULT; ++ if (copy_from_user(&qub, (void *)arg, sizeof(qub))) ++ break; ++ err = do_vzquotaugidctl(&qub); ++ break; ++#endif ++ default: ++ err = -ENOTTY; ++ } ++ might_sleep(); /* debug */ ++ return err; ++} ++ ++static struct vzioctlinfo vzdqcalls = { ++ .type = VZDQCTLTYPE, ++ .func = vzquota_ioctl, ++ .owner = THIS_MODULE, ++}; ++ ++/** ++ * vzquota_dstat - get quota usage info for virtual superblock ++ */ ++static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat) ++{ ++ struct vz_quota_master *qmblk; ++ ++ qmblk = vzquota_find_qmblk(super); ++ if (qmblk == NULL) ++ return -ENOENT; ++ if (qmblk == VZ_QUOTA_BAD) { ++ memset(qstat, 0, sizeof(*qstat)); ++ return 0; ++ } ++ ++ qmblk_data_read_lock(qmblk); ++ memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat)); ++ qmblk_data_read_unlock(qmblk); ++ qmblk_put(qmblk); ++ return 0; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Init/exit helpers ++ * ++ * ---------------------------------------------------------------------*/ ++ ++static int vzquota_cache_init(void) ++{ ++ int i; ++ ++ vzquota_cachep = kmem_cache_create("vz_quota_master", ++ sizeof(struct vz_quota_master), ++ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); ++ if (vzquota_cachep == NULL) { ++ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); ++ goto nomem2; ++ } ++ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&vzquota_hash_table[i]); ++ ++ return 0; ++ ++nomem2: ++ return -ENOMEM; ++} ++ ++static void vzquota_cache_release(void) ++{ ++ int i; ++ ++ /* sanity check */ ++ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) ++ if (!list_empty(&vzquota_hash_table[i])) ++ BUG(); ++ ++ /* release caches */ ++ if (kmem_cache_destroy(vzquota_cachep)) ++ printk(KERN_ERR ++ "VZQUOTA: vz_quota_master kmem_cache_destroy failed\n"); ++ vzquota_cachep = NULL; ++} ++ ++static int quota_notifier_call(struct vnotifier_block *self, ++ unsigned long n, void *data, int err) ++{ ++ struct virt_info_quota *viq; ++ struct super_block *sb; ++ ++ viq = (struct virt_info_quota *)data; ++ switch (n) { ++ case VIRTINFO_QUOTA_ON: ++ err = NOTIFY_BAD; ++ if (!try_module_get(THIS_MODULE)) ++ break; ++ sb = viq->super; ++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ err = NOTIFY_OK; ++ break; ++ case VIRTINFO_QUOTA_OFF: ++ module_put(THIS_MODULE); ++ err = NOTIFY_OK; ++ break; ++ case VIRTINFO_QUOTA_GETSTAT: ++ err = NOTIFY_BAD; ++ if (vzquota_dstat(viq->super, viq->qstat)) ++ break; ++ err = NOTIFY_OK; ++ break; ++ } ++ return err; ++} ++ ++struct vnotifier_block quota_notifier_block = { ++ .notifier_call = quota_notifier_call, ++ .priority = INT_MAX, ++}; ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Init/exit procedures ++ * ++ * ---------------------------------------------------------------------*/ ++ ++static int __init vzquota_init(void) ++{ ++ int err; ++ ++ if ((err = vzquota_cache_init()) != 0) ++ goto out_cache; ++ ++ if ((err = vzquota_proc_init()) != 0) ++ goto out_proc; ++ ++#ifdef CONFIG_VZ_QUOTA_UGID ++ if ((err = vzquota_ugid_init()) != 0) ++ goto out_ugid; ++#endif ++ ++ init_MUTEX(&vz_quota_sem); ++ 
vzioctl_register(&vzdqcalls);
++ virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block);
++#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS)
++ vzaquota_init();
++#endif
++
++ return 0;
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++out_ugid:
++ vzquota_proc_release();
++#endif
++out_proc:
++ vzquota_cache_release();
++out_cache:
++ return err;
++}
++
++#if defined(VZ_QUOTA_UNLOAD)
++static void __exit vzquota_release(void)
++{
++ virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block);
++ vzioctl_unregister(&vzdqcalls);
++#ifdef CONFIG_VZ_QUOTA_UGID
++#ifdef CONFIG_PROC_FS
++ vzaquota_fini();
++#endif
++ vzquota_ugid_release();
++#endif
++ vzquota_proc_release();
++ vzquota_cache_release();
++}
++#endif
++
++MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo Disk Quota");
++MODULE_LICENSE("GPL v2");
++
++module_init(vzquota_init)
++#if defined(VZ_QUOTA_UNLOAD)
++module_exit(vzquota_release)
++#endif
+diff -uprN linux-2.6.15.orig/include/asm-i386/elf.h linux-2.6.15-ve025stab014/include/asm-i386/elf.h
+--- linux-2.6.15.orig/include/asm-i386/elf.h 2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/include/asm-i386/elf.h 2006-01-27 14:48:08.000000000 +0300
+@@ -108,7 +108,7 @@ typedef struct user_fxsr_struct elf_fpxr
+ For the moment, we have only optimizations for the Intel generations,
+ but that could change... */
+
+-#define ELF_PLATFORM (system_utsname.machine)
++#define ELF_PLATFORM (ve_utsname.machine)
+
+ #ifdef __KERNEL__
+ #define SET_PERSONALITY(ex, ibcs2) do { } while (0)
+diff -uprN linux-2.6.15.orig/include/asm-i386/mman.h linux-2.6.15-ve025stab014/include/asm-i386/mman.h
+--- linux-2.6.15.orig/include/asm-i386/mman.h 2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/include/asm-i386/mman.h 2006-01-27 14:48:05.000000000 +0300
+@@ -22,6 +22,7 @@
+ #define MAP_NORESERVE 0x4000 /* don't check for reservations */
+ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
+ #define MAP_NONBLOCK 0x10000 /* do not block on IO */
++#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */
+
+ #define MS_ASYNC 1 /* sync memory asynchronously */
+ #define MS_INVALIDATE 2 /* invalidate the caches */
+diff -uprN linux-2.6.15.orig/include/asm-i386/timex.h linux-2.6.15-ve025stab014/include/asm-i386/timex.h
+--- linux-2.6.15.orig/include/asm-i386/timex.h 2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/include/asm-i386/timex.h 2006-01-27 14:48:08.000000000 +0300
+@@ -36,13 +36,17 @@ static inline cycles_t get_cycles (void)
+ {
+ unsigned long long ret=0;
+
+-#ifndef CONFIG_X86_TSC
+- if (!cpu_has_tsc)
+- return 0;
+-#endif
+-
+ #if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
+ rdtscll(ret);
++#elif defined(CONFIG_VE)
++ /*
++ * get_cycles is used in the following calculations:
++ * - VPS idle and iowait times in kernel/shced.h
++ * - task's sleep time to be shown with SyRq-t
++ * - kstat latencies in linux/vzstat.h
++ * - sched latency via wakeup_stamp in linux/ve_task.h
++ */
++#warning "some of VPS statistics won't be correct without get_cycles() (kstat_lat, ve_idle, etc)"
+ #endif
+ return ret;
+ }
+diff -uprN linux-2.6.15.orig/include/asm-i386/unistd.h linux-2.6.15-ve025stab014/include/asm-i386/unistd.h
+--- linux-2.6.15.orig/include/asm-i386/unistd.h 2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/include/asm-i386/unistd.h 2006-01-27 14:48:05.000000000 +0300
+@@ -299,8 +299,11 @@
+ #define __NR_inotify_init 291
+ #define __NR_inotify_add_watch 292
+ #define
__NR_inotify_rm_watch 293 +- +-#define NR_syscalls 294 ++#define __NR_getluid 510 ++#define __NR_setluid 511 ++#define __NR_setublimit 512 ++#define __NR_ubstat 513 ++#define NR_syscalls 513 + + /* + * user-visible error numbers are in the range -1 - -128: see +diff -uprN linux-2.6.15.orig/include/asm-ia64/mman.h linux-2.6.15-ve025stab014/include/asm-ia64/mman.h +--- linux-2.6.15.orig/include/asm-ia64/mman.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-ia64/mman.h 2006-01-27 14:48:05.000000000 +0300 +@@ -30,6 +30,7 @@ + #define MAP_NORESERVE 0x04000 /* don't check for reservations */ + #define MAP_POPULATE 0x08000 /* populate (prefault) pagetables */ + #define MAP_NONBLOCK 0x10000 /* do not block on IO */ ++#define MAP_EXECPRIO 0x20000 /* soft ubc charge */ + + #define MS_ASYNC 1 /* sync memory asynchronously */ + #define MS_INVALIDATE 2 /* invalidate the caches */ +diff -uprN linux-2.6.15.orig/include/asm-ia64/pgalloc.h linux-2.6.15-ve025stab014/include/asm-ia64/pgalloc.h +--- linux-2.6.15.orig/include/asm-ia64/pgalloc.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-ia64/pgalloc.h 2006-01-27 14:48:05.000000000 +0300 +@@ -20,6 +20,8 @@ + #include <linux/page-flags.h> + #include <linux/threads.h> + ++#include <ub/ub_mem.h> ++ + #include <asm/mmu_context.h> + + DECLARE_PER_CPU(unsigned long *, __pgtable_quicklist); +@@ -38,7 +40,7 @@ static inline long pgtable_quicklist_tot + return ql_size; + } + +-static inline void *pgtable_quicklist_alloc(void) ++static inline void *pgtable_quicklist_alloc(int charge) + { + unsigned long *ret = NULL; + +@@ -46,13 +48,19 @@ static inline void *pgtable_quicklist_al + + ret = pgtable_quicklist; + if (likely(ret != NULL)) { ++ if (ub_page_charge(virt_to_page(ret), 0, ++ charge ? __GFP_UBC|__GFP_SOFT_UBC : 0)) ++ goto out; ++ + pgtable_quicklist = (unsigned long *)(*ret); + ret[0] = 0; + --pgtable_quicklist_size; ++out: + preempt_enable(); + } else { + preempt_enable(); +- ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO); ++ ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO | ++ (charge ? 
__GFP_UBC | __GFP_SOFT_UBC : 0)); + } + + return ret; +@@ -70,6 +78,7 @@ static inline void pgtable_quicklist_fre + #endif + + preempt_disable(); ++ ub_page_uncharge(virt_to_page(pgtable_entry), 0); + *(unsigned long *)pgtable_entry = (unsigned long)pgtable_quicklist; + pgtable_quicklist = (unsigned long *)pgtable_entry; + ++pgtable_quicklist_size; +@@ -78,7 +87,7 @@ static inline void pgtable_quicklist_fre + + static inline pgd_t *pgd_alloc(struct mm_struct *mm) + { +- return pgtable_quicklist_alloc(); ++ return pgtable_quicklist_alloc(1); + } + + static inline void pgd_free(pgd_t * pgd) +@@ -95,7 +104,7 @@ pgd_populate(struct mm_struct *mm, pgd_t + + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return pgtable_quicklist_alloc(); ++ return pgtable_quicklist_alloc(1); + } + + static inline void pud_free(pud_t * pud) +@@ -113,7 +122,7 @@ pud_populate(struct mm_struct *mm, pud_t + + static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return pgtable_quicklist_alloc(); ++ return pgtable_quicklist_alloc(1); + } + + static inline void pmd_free(pmd_t * pmd) +@@ -138,13 +147,13 @@ pmd_populate_kernel(struct mm_struct *mm + static inline struct page *pte_alloc_one(struct mm_struct *mm, + unsigned long addr) + { +- return virt_to_page(pgtable_quicklist_alloc()); ++ return virt_to_page(pgtable_quicklist_alloc(1)); + } + + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long addr) + { +- return pgtable_quicklist_alloc(); ++ return pgtable_quicklist_alloc(0); + } + + static inline void pte_free(struct page *pte) +diff -uprN linux-2.6.15.orig/include/asm-ia64/processor.h linux-2.6.15-ve025stab014/include/asm-ia64/processor.h +--- linux-2.6.15.orig/include/asm-ia64/processor.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-ia64/processor.h 2006-01-27 14:48:08.000000000 +0300 +@@ -306,7 +306,7 @@ struct thread_struct { + regs->loadrs = 0; \ + regs->r8 = current->mm->dumpable; /* set "don't zap registers" flag */ \ + regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ +- if (unlikely(!current->mm->dumpable)) { \ ++ if (unlikely(!current->mm->dumpable || !current->mm->vps_dumpable)) { \ + /* \ + * Zap scratch regs to avoid leaking bits between processes with different \ + * uid/privileges. 
\ +diff -uprN linux-2.6.15.orig/include/asm-ia64/unistd.h linux-2.6.15-ve025stab014/include/asm-ia64/unistd.h +--- linux-2.6.15.orig/include/asm-ia64/unistd.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-ia64/unistd.h 2006-01-27 14:48:05.000000000 +0300 +@@ -269,12 +269,17 @@ + #define __NR_inotify_init 1277 + #define __NR_inotify_add_watch 1278 + #define __NR_inotify_rm_watch 1279 ++#define __NR_getluid 1505 ++#define __NR_setluid 1506 ++#define __NR_setublimit 1507 ++#define __NR_ubstat 1508 + + #ifdef __KERNEL__ + + #include <linux/config.h> + +-#define NR_syscalls 256 /* length of syscall table */ ++/* length of syscall table */ ++#define NR_syscalls (__NR_ubstat - __NR_ni_syscall + 1) + + #define __ARCH_WANT_SYS_RT_SIGACTION + +diff -uprN linux-2.6.15.orig/include/asm-powerpc/pgalloc.h linux-2.6.15-ve025stab014/include/asm-powerpc/pgalloc.h +--- linux-2.6.15.orig/include/asm-powerpc/pgalloc.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-powerpc/pgalloc.h 2006-01-27 14:48:05.000000000 +0300 +@@ -32,7 +32,8 @@ extern kmem_cache_t *pgtable_cache[]; + + static inline pgd_t *pgd_alloc(struct mm_struct *mm) + { +- return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); ++ return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], ++ GFP_KERNEL_UBC | __GFP_SOFT_UBC); + } + + static inline void pgd_free(pgd_t *pgd) +@@ -47,7 +48,7 @@ static inline void pgd_free(pgd_t *pgd) + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) + { + return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], +- GFP_KERNEL|__GFP_REPEAT); ++ GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); + } + + static inline void pud_free(pud_t *pud) +@@ -83,7 +84,7 @@ static inline void pmd_populate_kernel(s + static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) + { + return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM], +- GFP_KERNEL|__GFP_REPEAT); ++ GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); + } + + static inline void pmd_free(pmd_t *pmd) +@@ -91,17 +92,21 @@ static inline void pmd_free(pmd_t *pmd) + kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); + } + ++static inline pte_t *__pte_alloc(gfp_t flags) ++{ ++ return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], flags); ++} ++ + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) + { +- return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], +- GFP_KERNEL|__GFP_REPEAT); ++ return __pte_alloc(GFP_KERNEL | __GFP_REPEAT); + } + + static inline struct page *pte_alloc_one(struct mm_struct *mm, + unsigned long address) + { +- return virt_to_page(pte_alloc_one_kernel(mm, address)); ++ return virt_to_page(__pte_alloc(GFP_KERNEL_UBC | __GFP_SOFT_UBC)); + } + + static inline void pte_free_kernel(pte_t *pte) +diff -uprN linux-2.6.15.orig/include/asm-powerpc/unistd.h linux-2.6.15-ve025stab014/include/asm-powerpc/unistd.h +--- linux-2.6.15.orig/include/asm-powerpc/unistd.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-powerpc/unistd.h 2006-01-27 14:48:05.000000000 +0300 +@@ -297,7 +297,12 @@ + #define __NR_inotify_add_watch 276 + #define __NR_inotify_rm_watch 277 + +-#define __NR_syscalls 278 ++#define __NR_getluid 410 ++#define __NR_setluid 411 ++#define __NR_setublimit 412 ++#define __NR_ubstat 413 ++ ++#define __NR_syscalls 414 + + #ifdef __KERNEL__ + #define __NR__exit __NR_exit +diff -uprN linux-2.6.15.orig/include/asm-s390/pgalloc.h linux-2.6.15-ve025stab014/include/asm-s390/pgalloc.h +--- 
linux-2.6.15.orig/include/asm-s390/pgalloc.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-s390/pgalloc.h 2006-01-27 14:48:05.000000000 +0300 +@@ -34,12 +34,12 @@ static inline pgd_t *pgd_alloc(struct mm + int i; + + #ifndef __s390x__ +- pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,1); ++ pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 1); + if (pgd != NULL) + for (i = 0; i < USER_PTRS_PER_PGD; i++) + pmd_clear(pmd_offset(pgd + i, i*PGDIR_SIZE)); + #else /* __s390x__ */ +- pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,2); ++ pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2); + if (pgd != NULL) + for (i = 0; i < PTRS_PER_PGD; i++) + pgd_clear(pgd + i); +@@ -72,7 +72,7 @@ static inline pmd_t * pmd_alloc_one(stru + pmd_t *pmd; + int i; + +- pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 2); ++ pmd = (pmd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2); + if (pmd != NULL) { + for (i=0; i < PTRS_PER_PMD; i++) + pmd_clear(pmd+i); +@@ -118,16 +118,13 @@ pmd_populate(struct mm_struct *mm, pmd_t + pmd_populate_kernel(mm, pmd, (pte_t *)((page-mem_map) << PAGE_SHIFT)); + } + +-/* +- * page table entry allocation/free routines. +- */ +-static inline pte_t * +-pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) ++static inline pte_t *pte_alloc(struct mm_struct *mm, unsigned long vmaddr, ++ gfp_t mask) + { + pte_t *pte; + int i; + +- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); ++ pte = (pte_t *)__get_free_page(mask); + if (pte != NULL) { + for (i=0; i < PTRS_PER_PTE; i++) { + pte_clear(mm, vmaddr, pte+i); +@@ -137,10 +134,20 @@ pte_alloc_one_kernel(struct mm_struct *m + return pte; + } + ++/* ++ * page table entry allocation/free routines. ++ */ ++static inline pte_t * ++pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) ++{ ++ return pte_alloc(mm, vmaddr, GFP_KERNEL | __GFP_REPEAT); ++} ++ + static inline struct page * + pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr) + { +- pte_t *pte = pte_alloc_one_kernel(mm, vmaddr); ++ pte_t *pte = pte_alloc(mm, vmaddr, GFP_KERNEL_UBC | __GFP_SOFT_UBC | ++ __GFP_REPEAT); + if (pte) + return virt_to_page(pte); + return 0; +diff -uprN linux-2.6.15.orig/include/asm-s390/unistd.h linux-2.6.15-ve025stab014/include/asm-s390/unistd.h +--- linux-2.6.15.orig/include/asm-s390/unistd.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-s390/unistd.h 2006-01-27 14:48:05.000000000 +0300 +@@ -279,8 +279,12 @@ + #define __NR_inotify_init 284 + #define __NR_inotify_add_watch 285 + #define __NR_inotify_rm_watch 286 ++#define __NR_getluid 410 ++#define __NR_setluid 411 ++#define __NR_setublimit 412 ++#define __NR_ubstat 413 + +-#define NR_syscalls 287 ++#define NR_syscalls 414 + + /* + * There are some system calls that are not present on 64 bit, some +diff -uprN linux-2.6.15.orig/include/asm-x86_64/mman.h linux-2.6.15-ve025stab014/include/asm-x86_64/mman.h +--- linux-2.6.15.orig/include/asm-x86_64/mman.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-x86_64/mman.h 2006-01-27 14:48:05.000000000 +0300 +@@ -23,6 +23,7 @@ + #define MAP_NORESERVE 0x4000 /* don't check for reservations */ + #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ + #define MAP_NONBLOCK 0x10000 /* do not block on IO */ ++#define MAP_EXECPRIO 0x20000 /* soft ubc charge */ + + #define MS_ASYNC 1 /* sync memory asynchronously */ + #define MS_INVALIDATE 2 /* invalidate the caches */ +diff -uprN 
linux-2.6.15.orig/include/asm-x86_64/pgalloc.h linux-2.6.15-ve025stab014/include/asm-x86_64/pgalloc.h +--- linux-2.6.15.orig/include/asm-x86_64/pgalloc.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-x86_64/pgalloc.h 2006-01-27 14:48:05.000000000 +0300 +@@ -31,12 +31,14 @@ static inline void pmd_free(pmd_t *pmd) + + static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) + { +- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); ++ return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| ++ __GFP_SOFT_UBC); + } + + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); ++ return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| ++ __GFP_SOFT_UBC); + } + + static inline void pud_free (pud_t *pud) +@@ -48,7 +50,8 @@ static inline void pud_free (pud_t *pud) + static inline pgd_t *pgd_alloc(struct mm_struct *mm) + { + unsigned boundary; +- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); ++ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL_UBC|__GFP_REPEAT| ++ __GFP_SOFT_UBC); + if (!pgd) + return NULL; + /* +@@ -77,7 +80,8 @@ static inline pte_t *pte_alloc_one_kerne + + static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) + { +- void *p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); ++ void *p = (void *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| ++ __GFP_SOFT_UBC); + if (!p) + return NULL; + return virt_to_page(p); +diff -uprN linux-2.6.15.orig/include/asm-x86_64/unistd.h linux-2.6.15-ve025stab014/include/asm-x86_64/unistd.h +--- linux-2.6.15.orig/include/asm-x86_64/unistd.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/asm-x86_64/unistd.h 2006-01-27 14:48:05.000000000 +0300 +@@ -571,8 +571,16 @@ __SYSCALL(__NR_inotify_init, sys_inotify + __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) + #define __NR_inotify_rm_watch 255 + __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) ++#define __NR_getluid 500 ++__SYSCALL(__NR_getluid, sys_getluid) ++#define __NR_setluid 501 ++__SYSCALL(__NR_setluid, sys_setluid) ++#define __NR_setublimit 502 ++__SYSCALL(__NR_setublimit, sys_setublimit) ++#define __NR_ubstat 503 ++__SYSCALL(__NR_ubstat, sys_ubstat) + +-#define __NR_syscall_max __NR_inotify_rm_watch ++#define __NR_syscall_max __NR_ubstat + #ifndef __NO_STUBS + + /* user-visible error numbers are in the range -1 - -4095 */ +diff -uprN linux-2.6.15.orig/include/linux/capability.h linux-2.6.15-ve025stab014/include/linux/capability.h +--- linux-2.6.15.orig/include/linux/capability.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/capability.h 2006-01-27 14:48:08.000000000 +0300 +@@ -145,12 +145,9 @@ typedef __u32 kernel_cap_t; + + #define CAP_NET_BROADCAST 11 + +-/* Allow interface configuration */ + /* Allow administration of IP firewall, masquerading and accounting */ + /* Allow setting debug option on sockets */ + /* Allow modification of routing tables */ +-/* Allow setting arbitrary process / process group ownership on +- sockets */ + /* Allow binding to any address for transparent proxying */ + /* Allow setting TOS (type of service) */ + /* Allow setting promiscuous mode */ +@@ -199,24 +196,19 @@ typedef __u32 kernel_cap_t; + + /* Allow configuration of the secure attention key */ + /* Allow administration of the random device */ +-/* Allow examination and configuration of disk quotas */ + /* Allow configuring the kernel's 
syslog (printk behaviour) */ + /* Allow setting the domainname */ + /* Allow setting the hostname */ + /* Allow calling bdflush() */ +-/* Allow mount() and umount(), setting up new smb connection */ ++/* Allow setting up new smb connection */ + /* Allow some autofs root ioctls */ + /* Allow nfsservctl */ + /* Allow VM86_REQUEST_IRQ */ + /* Allow to read/write pci config on alpha */ + /* Allow irix_prctl on mips (setstacksize) */ + /* Allow flushing all cache on m68k (sys_cacheflush) */ +-/* Allow removing semaphores */ +-/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores +- and shared memory */ + /* Allow locking/unlocking of shared memory segment */ + /* Allow turning swap on/off */ +-/* Allow forged pids on socket credentials passing */ + /* Allow setting readahead and flushing buffers on block devices */ + /* Allow setting geometry in floppy driver */ + /* Allow turning DMA on/off in xd driver */ +@@ -287,7 +279,52 @@ typedef __u32 kernel_cap_t; + + #define CAP_AUDIT_CONTROL 30 + ++/* ++ * Important note: VZ capabilities do intersect with CAP_AUDIT ++ * this is due to compatibility reasons. Nothing bad. ++ * Both VZ and Audit/SELinux caps are disabled in VPSs. ++ */ ++ ++/* Allow access to all information. In the other case some structures will be ++ hiding to ensure different Virtual Environment non-interaction on the same ++ node */ ++#define CAP_SETVEID 29 ++ ++#define CAP_VE_ADMIN 30 ++ + #ifdef __KERNEL__ ++ ++#include <linux/config.h> ++ ++#ifdef CONFIG_VE ++ ++/* Replacement for CAP_NET_ADMIN: ++ delegated rights to the Virtual environment of its network administration. ++ For now the following rights have been delegated: ++ ++ Allow setting arbitrary process / process group ownership on sockets ++ Allow interface configuration ++ */ ++#define CAP_VE_NET_ADMIN CAP_VE_ADMIN ++ ++/* Replacement for CAP_SYS_ADMIN: ++ delegated rights to the Virtual environment of its administration. ++ For now the following rights have been delegated: ++ */ ++/* Allow mount/umount/remount */ ++/* Allow examination and configuration of disk quotas */ ++/* Allow removing semaphores */ ++/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores ++ and shared memory */ ++/* Allow locking/unlocking of shared memory segment */ ++/* Allow forged pids on socket credentials passing */ ++ ++#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN ++#else ++#define CAP_VE_NET_ADMIN CAP_NET_ADMIN ++#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN ++#endif ++ + /* + * Bounding set + */ +@@ -351,9 +388,14 @@ static inline kernel_cap_t cap_invert(ke + #define cap_issubset(a,set) (!(cap_t(a) & ~cap_t(set))) + + #define cap_clear(c) do { cap_t(c) = 0; } while(0) ++#ifndef CONFIG_VE + #define cap_set_full(c) do { cap_t(c) = ~0; } while(0) ++#else ++#define cap_set_full(c) \ ++ do {cap_t(c) = ve_is_super(get_exec_env()) ? 
~0 : \ ++ get_exec_env()->cap_default; } while(0) ++#endif + #define cap_mask(c,mask) do { cap_t(c) &= cap_t(mask); } while(0) +- + #define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK) + + #endif /* __KERNEL__ */ +diff -uprN linux-2.6.15.orig/include/linux/dcache.h linux-2.6.15-ve025stab014/include/linux/dcache.h +--- linux-2.6.15.orig/include/linux/dcache.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/dcache.h 2006-01-27 14:48:08.000000000 +0300 +@@ -10,6 +10,8 @@ + #include <linux/rcupdate.h> + #include <asm/bug.h> + ++#include <ub/ub_dcache.h> ++ + struct nameidata; + struct vfsmount; + +@@ -105,6 +107,9 @@ struct dentry { + struct rcu_head d_rcu; + struct dcookie_struct *d_cookie; /* cookie, if any */ + int d_mounted; ++#ifdef CONFIG_USER_RESOURCE ++ struct dentry_beancounter dentry_bc; ++#endif + unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ + }; + +@@ -155,7 +160,11 @@ d_iput: no no no yes + + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ + #define DCACHE_UNHASHED 0x0010 ++#define DCACHE_VIRTUAL 0x0100 /* ve accessible */ ++ ++extern void mark_tree_virtual(struct vfsmount *m, struct dentry *d); + ++extern kmem_cache_t *dentry_cache; + extern spinlock_t dcache_lock; + + /** +@@ -209,7 +218,8 @@ extern struct dentry * d_alloc_anon(stru + extern struct dentry * d_splice_alias(struct inode *, struct dentry *); + extern void shrink_dcache_sb(struct super_block *); + extern void shrink_dcache_parent(struct dentry *); +-extern void shrink_dcache_anon(struct hlist_head *); ++extern void shrink_dcache_anon(struct super_block *); ++extern void dcache_shrinker_wait_sb(struct super_block *sb); + extern int d_invalidate(struct dentry *); + + /* only used at mount-time */ +@@ -271,6 +281,7 @@ extern struct dentry * __d_lookup(struct + /* validate "insecure" dentry pointer */ + extern int d_validate(struct dentry *, struct dentry *); + ++extern int d_root_check(struct dentry *, struct vfsmount *); + extern char * d_path(struct dentry *, struct vfsmount *, char *, int); + + /* Allocation counts.. 
*/ +@@ -291,6 +302,8 @@ extern char * d_path(struct dentry *, st + static inline struct dentry *dget(struct dentry *dentry) + { + if (dentry) { ++ if (ub_dget_testone(dentry)) ++ BUG(); + BUG_ON(!atomic_read(&dentry->d_count)); + atomic_inc(&dentry->d_count); + } +@@ -334,6 +347,8 @@ extern struct dentry *lookup_create(stru + + extern int sysctl_vfs_cache_pressure; + ++extern int check_area_access_ve(struct dentry *, struct vfsmount *); ++extern int check_area_execute_ve(struct dentry *, struct vfsmount *); + #endif /* __KERNEL__ */ + + #endif /* __LINUX_DCACHE_H */ +diff -uprN linux-2.6.15.orig/include/linux/devpts_fs.h linux-2.6.15-ve025stab014/include/linux/devpts_fs.h +--- linux-2.6.15.orig/include/linux/devpts_fs.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/devpts_fs.h 2006-01-27 14:48:08.000000000 +0300 +@@ -21,6 +21,15 @@ int devpts_pty_new(struct tty_struct *tt + struct tty_struct *devpts_get_tty(int number); /* get tty structure */ + void devpts_pty_kill(int number); /* unlink */ + ++struct devpts_config { ++ int setuid; ++ int setgid; ++ uid_t uid; ++ gid_t gid; ++ umode_t mode; ++}; ++ ++extern struct devpts_config devpts_config; + #else + + /* Dummy stubs in the no-pty case */ +diff -uprN linux-2.6.15.orig/include/linux/faudit.h linux-2.6.15-ve025stab014/include/linux/faudit.h +--- linux-2.6.15.orig/include/linux/faudit.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/faudit.h 2006-01-27 14:48:07.000000000 +0300 +@@ -0,0 +1,38 @@ ++/* ++ * include/linux/faudit.h ++ * ++ * Copyright (C) 2005 SWSoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __FAUDIT_H_ ++#define __FAUDIT_H_ ++ ++#include <linux/config.h> ++#include <linux/virtinfo.h> ++ ++struct vfsmount; ++struct dentry; ++struct pt_regs; ++ ++struct faudit_regs_arg { ++ int err; ++ struct pt_regs *regs; ++}; ++ ++struct faudit_stat_arg { ++ int err; ++ struct vfsmount *mnt; ++ struct dentry *dentry; ++ void *stat; ++}; ++ ++#define VIRTINFO_FAUDIT (0) ++#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0) ++#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1) ++#define VIRTINFO_FAUDIT_STATFS64 (VIRTINFO_FAUDIT + 2) ++ ++#endif +diff -uprN linux-2.6.15.orig/include/linux/fs.h linux-2.6.15-ve025stab014/include/linux/fs.h +--- linux-2.6.15.orig/include/linux/fs.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/fs.h 2006-01-27 14:48:08.000000000 +0300 +@@ -7,6 +7,7 @@ + */ + + #include <linux/config.h> ++#include <linux/ve_owner.h> + #include <linux/limits.h> + #include <linux/ioctl.h> + #include <linux/rcuref.h> +@@ -64,6 +65,7 @@ extern int dir_notify_enable; + #define FMODE_LSEEK 4 + #define FMODE_PREAD 8 + #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ ++#define FMODE_QUOTACTL 4 + + #define RW_MASK 1 + #define RWA_MASK 2 +@@ -83,6 +85,7 @@ extern int dir_notify_enable; + /* public flags for file_system_type */ + #define FS_REQUIRES_DEV 1 + #define FS_BINARY_MOUNTDATA 2 ++#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */ + #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ + #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon + * as nfs_rename() will be cleaned up +@@ -301,6 +304,9 @@ struct iattr { + * Includes for diskquotas. 
+ */ + #include <linux/quota.h> ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++#include <linux/vzquota_qlnk.h> ++#endif + + /* + * oh the beauties of C type declarations. +@@ -464,6 +470,9 @@ struct inode { + #ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + #endif ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++ struct vz_quota_ilink i_qlnk; ++#endif + /* These three should probably be a union */ + struct list_head i_devices; + struct pipe_inode_info *i_pipe; +@@ -498,6 +507,8 @@ struct inode { + #endif + }; + ++extern kmem_cache_t *inode_cachep; ++ + /* + * NOTE: in a 32bit arch with a preemptable kernel and + * an UP compile the i_size_read/write must be atomic +@@ -617,7 +628,10 @@ struct file { + spinlock_t f_ep_lock; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; ++ struct ve_struct *owner_env; + }; ++DCL_VE_OWNER_PROTO(FILP, struct file, owner_env) ++ + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); + #define file_list_unlock() spin_unlock(&files_lock); +@@ -681,6 +695,9 @@ struct file_lock { + struct file *fl_file; + unsigned char fl_flags; + unsigned char fl_type; ++#ifdef CONFIG_USER_RESOURCE ++ unsigned char fl_charged; ++#endif + loff_t fl_start; + loff_t fl_end; + +@@ -803,6 +820,7 @@ struct super_block { + struct list_head s_io; /* parked for writeback */ + struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ + struct list_head s_files; ++ struct list_head s_dshrinkers; /* active dcache shrinkers */ + + struct block_device *s_bdev; + struct list_head s_instances; +@@ -1059,6 +1077,8 @@ struct super_operations { + + ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); + ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); ++ ++ struct inode *(*get_quota_root)(struct super_block *); + }; + + /* Inode state bits. Protected by inode_lock. 
*/ +@@ -1221,8 +1241,14 @@ struct file_system_type { + struct module *owner; + struct file_system_type * next; + struct list_head fs_supers; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(FSTYPE, struct file_system_type, owner_env) ++ ++void get_filesystem(struct file_system_type *fs); ++void put_filesystem(struct file_system_type *fs); ++ + struct super_block *get_sb_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)); +@@ -1260,6 +1286,7 @@ extern struct vfsmount *kern_mount(struc + extern int may_umount_tree(struct vfsmount *); + extern int may_umount(struct vfsmount *); + extern void umount_tree(struct vfsmount *, int, struct list_head *); ++#define kern_umount mntput + extern void release_mounts(struct list_head *); + extern long do_mount(char *, char *, char *, unsigned long, void *); + extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); +@@ -1365,7 +1392,7 @@ extern int chrdev_open(struct inode *, s + #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ + extern const char *__bdevname(dev_t, char *buffer); + extern const char *bdevname(struct block_device *bdev, char *buffer); +-extern struct block_device *lookup_bdev(const char *); ++extern struct block_device *lookup_bdev(const char *, int mode); + extern struct block_device *open_bdev_excl(const char *, int, void *); + extern void close_bdev_excl(struct block_device *); + +diff -uprN linux-2.6.15.orig/include/linux/genhd.h linux-2.6.15-ve025stab014/include/linux/genhd.h +--- linux-2.6.15.orig/include/linux/genhd.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/genhd.h 2006-01-27 14:48:08.000000000 +0300 +@@ -421,6 +421,7 @@ static inline struct block_device *bdget + return bdget(MKDEV(disk->major, disk->first_minor) + index); + } + ++extern struct subsystem block_subsys; + #endif + + #endif +diff -uprN linux-2.6.15.orig/include/linux/gfp.h linux-2.6.15-ve025stab014/include/linux/gfp.h +--- linux-2.6.15.orig/include/linux/gfp.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/gfp.h 2006-01-27 14:48:05.000000000 +0300 +@@ -47,6 +47,8 @@ struct vm_area_struct; + #define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ + #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ + #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ ++#define __GFP_UBC ((__force gfp_t)0x40000u)/* charge kmem in buddy and slab */ ++#define __GFP_SOFT_UBC ((__force gfp_t)0x80000u)/* use soft charging */ + + #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ + #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) +@@ -55,13 +57,16 @@ struct vm_area_struct; + #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ + __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ + __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ +- __GFP_NOMEMALLOC|__GFP_HARDWALL) ++ __GFP_NOMEMALLOC|__GFP_HARDWALL| \ ++ __GFP_UBC|__GFP_SOFT_UBC) + + #define GFP_ATOMIC (__GFP_HIGH) + #define GFP_NOIO (__GFP_WAIT) + #define GFP_NOFS (__GFP_WAIT | __GFP_IO) + #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) ++#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC) + #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) ++#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC) + #define 
GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ + __GFP_HIGHMEM) + +diff -uprN linux-2.6.15.orig/include/linux/inetdevice.h linux-2.6.15-ve025stab014/include/linux/inetdevice.h +--- linux-2.6.15.orig/include/linux/inetdevice.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/inetdevice.h 2006-01-27 14:48:08.000000000 +0300 +@@ -34,6 +34,12 @@ struct ipv4_devconf + }; + + extern struct ipv4_devconf ipv4_devconf; ++extern struct ipv4_devconf ipv4_devconf_dflt; ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_ipv4_devconf (*(get_exec_env()->_ipv4_devconf)) ++#else ++#define ve_ipv4_devconf ipv4_devconf ++#endif + + struct in_device + { +@@ -60,29 +66,29 @@ struct in_device + }; + + #define IN_DEV_FORWARD(in_dev) ((in_dev)->cnf.forwarding) +-#define IN_DEV_MFORWARD(in_dev) (ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding) +-#define IN_DEV_RPFILTER(in_dev) (ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter) +-#define IN_DEV_SOURCE_ROUTE(in_dev) (ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route) +-#define IN_DEV_BOOTP_RELAY(in_dev) (ipv4_devconf.bootp_relay && (in_dev)->cnf.bootp_relay) +- +-#define IN_DEV_LOG_MARTIANS(in_dev) (ipv4_devconf.log_martians || (in_dev)->cnf.log_martians) +-#define IN_DEV_PROXY_ARP(in_dev) (ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp) +-#define IN_DEV_SHARED_MEDIA(in_dev) (ipv4_devconf.shared_media || (in_dev)->cnf.shared_media) +-#define IN_DEV_TX_REDIRECTS(in_dev) (ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects) +-#define IN_DEV_SEC_REDIRECTS(in_dev) (ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects) ++#define IN_DEV_MFORWARD(in_dev) (ve_ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding) ++#define IN_DEV_RPFILTER(in_dev) (ve_ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter) ++#define IN_DEV_SOURCE_ROUTE(in_dev) (ve_ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route) ++#define IN_DEV_BOOTP_RELAY(in_dev) (ve_ipv4_devconf.bootp_relay && (in_dev)->cnf.bootp_relay) ++ ++#define IN_DEV_LOG_MARTIANS(in_dev) (ve_ipv4_devconf.log_martians || (in_dev)->cnf.log_martians) ++#define IN_DEV_PROXY_ARP(in_dev) (ve_ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp) ++#define IN_DEV_SHARED_MEDIA(in_dev) (ve_ipv4_devconf.shared_media || (in_dev)->cnf.shared_media) ++#define IN_DEV_TX_REDIRECTS(in_dev) (ve_ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects) ++#define IN_DEV_SEC_REDIRECTS(in_dev) (ve_ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects) + #define IN_DEV_IDTAG(in_dev) ((in_dev)->cnf.tag) + #define IN_DEV_MEDIUM_ID(in_dev) ((in_dev)->cnf.medium_id) + #define IN_DEV_PROMOTE_SECONDARIES(in_dev) (ipv4_devconf.promote_secondaries || (in_dev)->cnf.promote_secondaries) + + #define IN_DEV_RX_REDIRECTS(in_dev) \ + ((IN_DEV_FORWARD(in_dev) && \ +- (ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \ ++ (ve_ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \ + || (!IN_DEV_FORWARD(in_dev) && \ +- (ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects))) ++ (ve_ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects))) + +-#define IN_DEV_ARPFILTER(in_dev) (ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter) +-#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce)) +-#define IN_DEV_ARP_IGNORE(in_dev) (max(ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore)) ++#define IN_DEV_ARPFILTER(in_dev) 
(ve_ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter) ++#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ve_ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce)) ++#define IN_DEV_ARP_IGNORE(in_dev) (max(ve_ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore)) + + struct in_ifaddr + { +@@ -113,6 +119,7 @@ extern u32 inet_select_addr(const struc + extern u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope); + extern struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask); + extern void inet_forward_change(void); ++extern void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); + + static __inline__ int inet_ifa_match(u32 addr, struct in_ifaddr *ifa) + { +@@ -180,6 +187,10 @@ static inline void in_dev_put(struct in_ + #define __in_dev_put(idev) atomic_dec(&(idev)->refcnt) + #define in_dev_hold(idev) atomic_inc(&(idev)->refcnt) + ++struct ve_struct; ++extern int devinet_sysctl_init(struct ve_struct *); ++extern void devinet_sysctl_fini(struct ve_struct *); ++extern void devinet_sysctl_free(struct ve_struct *); + #endif /* __KERNEL__ */ + + static __inline__ __u32 inet_make_mask(int logmask) +diff -uprN linux-2.6.15.orig/include/linux/kdev_t.h linux-2.6.15-ve025stab014/include/linux/kdev_t.h +--- linux-2.6.15.orig/include/linux/kdev_t.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/kdev_t.h 2006-01-27 14:48:08.000000000 +0300 +@@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 de + return dev & 0x3ffff; + } + ++#define UNNAMED_MAJOR_COUNT 16 ++ ++#if UNNAMED_MAJOR_COUNT > 1 ++ ++extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT]; ++ ++static inline dev_t make_unnamed_dev(int idx) ++{ ++ /* ++ * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the ++ * unnamed device index into major number. ++ */ ++ return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)], ++ idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8)); ++} ++ ++static inline int unnamed_dev_idx(dev_t dev) ++{ ++ int i; ++ for (i = 0; i < UNNAMED_MAJOR_COUNT && ++ MAJOR(dev) != unnamed_dev_majors[i]; i++); ++ return MINOR(dev) | (i << 8); ++} ++ ++static inline int is_unnamed_dev(dev_t dev) ++{ ++ int i; ++ for (i = 0; i < UNNAMED_MAJOR_COUNT && ++ MAJOR(dev) != unnamed_dev_majors[i]; i++); ++ return i < UNNAMED_MAJOR_COUNT; ++} ++ ++#else /* UNNAMED_MAJOR_COUNT */ ++ ++static inline dev_t make_unnamed_dev(int idx) ++{ ++ return MKDEV(0, idx); ++} ++ ++static inline int unnamed_dev_idx(dev_t dev) ++{ ++ return MINOR(dev); ++} ++ ++static inline int is_unnamed_dev(dev_t dev) ++{ ++ return MAJOR(dev) == 0; ++} ++ ++#endif /* UNNAMED_MAJOR_COUNT */ ++ + + #else /* __KERNEL__ */ + +diff -uprN linux-2.6.15.orig/include/linux/kernel.h linux-2.6.15-ve025stab014/include/linux/kernel.h +--- linux-2.6.15.orig/include/linux/kernel.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/kernel.h 2006-01-27 14:48:08.000000000 +0300 +@@ -128,6 +128,9 @@ asmlinkage int vprintk(const char *fmt, + __attribute__ ((format (printf, 1, 0))); + asmlinkage int printk(const char * fmt, ...) + __attribute__ ((format (printf, 1, 2))); ++asmlinkage int ve_printk(int, const char * fmt, ...) ++ __attribute__ ((format (printf, 2, 3))); ++void prepare_printk(void); + #else + static inline int vprintk(const char *s, va_list args) + __attribute__ ((format (printf, 1, 0))); +@@ -135,8 +138,16 @@ static inline int vprintk(const char *s, + static inline int printk(const char *s, ...) 
+ __attribute__ ((format (printf, 1, 2))); + static inline int printk(const char *s, ...) { return 0; } ++static inline int ve_printk(int d, const char *s, ...) ++ __attribute__ ((format (printf, 1, 2))); ++static inline int printk(int d, const char *s, ...) { return 0; } ++#define prepare_printk() do { } while (0) + #endif + ++#define VE0_LOG 1 ++#define VE_LOG 2 ++#define VE_LOG_BOTH (VE0_LOG | VE_LOG) ++ + unsigned long int_sqrt(unsigned long); + + static inline int __attribute_pure__ long_log2(unsigned long x) +@@ -171,6 +182,7 @@ extern int oops_in_progress; /* If set, + extern __deprecated_for_modules int panic_timeout; + extern int panic_on_oops; + extern int tainted; ++extern int kernel_text_csum_broken; + extern const char *print_tainted(void); + extern void add_taint(unsigned); + +diff -uprN linux-2.6.15.orig/include/linux/kmem_cache.h linux-2.6.15-ve025stab014/include/linux/kmem_cache.h +--- linux-2.6.15.orig/include/linux/kmem_cache.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/kmem_cache.h 2006-01-27 14:48:05.000000000 +0300 +@@ -0,0 +1,188 @@ ++#ifndef __KMEM_CACHE_H__ ++#define __KMEM_CACHE_H__ ++ ++#include <linux/threads.h> ++#include <linux/smp.h> ++#include <linux/spinlock.h> ++#include <linux/list.h> ++#include <linux/mm.h> ++#include <asm/atomic.h> ++ ++/* ++ * SLAB_DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, ++ * SLAB_RED_ZONE & SLAB_POISON. ++ * 0 for faster, smaller code (especially in the critical paths). ++ * ++ * SLAB_STATS - 1 to collect stats for /proc/slabinfo. ++ * 0 for faster, smaller code (especially in the critical paths). ++ * ++ * SLAB_FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) ++ */ ++ ++#ifdef CONFIG_DEBUG_SLAB ++#define SLAB_DEBUG 1 ++#define SLAB_STATS 1 ++#define SLAB_FORCED_DEBUG 1 ++#else ++#define SLAB_DEBUG 0 ++#define SLAB_STATS 0 /* must be off, see kmem_cache.h */ ++#define SLAB_FORCED_DEBUG 0 ++#endif ++ ++/* ++ * struct array_cache ++ * ++ * Purpose: ++ * - LIFO ordering, to hand out cache-warm objects from _alloc ++ * - reduce the number of linked list operations ++ * - reduce spinlock operations ++ * ++ * The limit is stored in the per-cpu structure to reduce the data cache ++ * footprint. ++ * ++ */ ++struct array_cache { ++ unsigned int avail; ++ unsigned int limit; ++ unsigned int batchcount; ++ unsigned int touched; ++ spinlock_t lock; ++ void *entry[0]; /* ++ * Must have this definition in here for the proper ++ * alignment of array_cache. Also simplifies accessing ++ * the entries. ++ * [0] is for gcc 2.95. It should really be []. ++ */ ++}; ++ ++/* bootstrap: The caches do not work without cpuarrays anymore, ++ * but the cpuarrays are allocated from the generic caches... ++ */ ++#define BOOT_CPUCACHE_ENTRIES 1 ++struct arraycache_init { ++ struct array_cache cache; ++ void * entries[BOOT_CPUCACHE_ENTRIES]; ++}; ++ ++/* ++ * The slab lists for all objects. ++ */ ++struct kmem_list3 { ++ struct list_head slabs_partial; /* partial list first, better asm code */ ++ struct list_head slabs_full; ++ struct list_head slabs_free; ++ unsigned long free_objects; ++ unsigned long next_reap; ++ int free_touched; ++ unsigned int free_limit; ++ spinlock_t list_lock; ++ struct array_cache *shared; /* shared per node */ ++ struct array_cache **alien; /* on other nodes */ ++}; ++ ++/* ++ * kmem_cache_t ++ * ++ * manages a cache. 
++ */ ++ ++struct kmem_cache { ++/* 1) per-cpu data, touched during every alloc/free */ ++ struct array_cache *array[NR_CPUS]; ++ unsigned int batchcount; ++ unsigned int limit; ++ unsigned int shared; ++ unsigned int objsize; ++/* 2) touched by every alloc & free from the backend */ ++ struct kmem_list3 *nodelists[MAX_NUMNODES]; ++ unsigned int flags; /* constant flags */ ++ unsigned int num; /* # of objs per slab */ ++ spinlock_t spinlock; ++ ++/* 3) cache_grow/shrink */ ++ /* order of pgs per slab (2^n) */ ++ unsigned int gfporder; ++ ++ /* force GFP flags, e.g. GFP_DMA */ ++ gfp_t gfpflags; ++ ++ size_t colour; /* cache colouring range */ ++ unsigned int colour_off; /* colour offset */ ++ unsigned int colour_next; /* cache colouring */ ++ kmem_cache_t *slabp_cache; ++ unsigned int slab_size; ++ unsigned int dflags; /* dynamic flags */ ++ ++ /* constructor func */ ++ void (*ctor)(void *, kmem_cache_t *, unsigned long); ++ ++ /* de-constructor func */ ++ void (*dtor)(void *, kmem_cache_t *, unsigned long); ++ ++/* 4) cache creation/removal */ ++ const char *name; ++ struct list_head next; ++ ++/* 5) statistics */ ++#if SLAB_STATS ++ unsigned long num_active; ++ unsigned long num_allocations; ++ unsigned long high_mark; ++ unsigned long grown; ++ unsigned long reaped; ++ unsigned long errors; ++ unsigned long max_freeable; ++ unsigned long node_allocs; ++ unsigned long node_frees; ++ atomic_t allochit; ++ atomic_t allocmiss; ++ atomic_t freehit; ++ atomic_t freemiss; ++#endif ++#if SLAB_DEBUG ++ int dbghead; ++ int reallen; ++#endif ++#ifdef CONFIG_USER_RESOURCE ++ unsigned int objuse; ++#endif ++}; ++ ++struct slab; ++ ++/* Functions and macros for storing/retrieving the cachep and or slab from the ++ * global 'mem_map'. These are used to find the slab an obj belongs to. ++ * With kfree(), these are used to find the cache which an obj belongs to. ++ */ ++static inline void page_set_cache(struct page *page, struct kmem_cache *cache) ++{ ++ page->lru.next = (struct list_head *)cache; ++} ++ ++static inline struct kmem_cache *page_get_cache(struct page *page) ++{ ++ return (struct kmem_cache *)page->lru.next; ++} ++ ++static inline void page_set_slab(struct page *page, struct slab *slab) ++{ ++ page->lru.prev = (struct list_head *)slab; ++} ++ ++static inline struct slab *page_get_slab(struct page *page) ++{ ++ return (struct slab *)page->lru.prev; ++} ++ ++#define SET_PAGE_CACHE(pg,x) page_set_cache(pg,x) ++#define GET_PAGE_CACHE(pg) page_get_cache(pg) ++#define SET_PAGE_SLAB(pg,x) page_set_slab(pg,x) ++#define GET_PAGE_SLAB(pg) page_get_slab(pg) ++ ++#define CFLGS_OFF_SLAB (0x80000000UL) ++#define CFLGS_ENVIDS (0x04000000UL) ++#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) ++#define ENVIDS(x) ((x)->flags & CFLGS_ENVIDS) ++ ++#define kmem_mark_nocharge(c) do { (c)->flags |= SLAB_NO_CHARGE; } while (0) ++#endif /* __KMEM_CACHE_H__ */ +diff -uprN linux-2.6.15.orig/include/linux/kmem_slab.h linux-2.6.15-ve025stab014/include/linux/kmem_slab.h +--- linux-2.6.15.orig/include/linux/kmem_slab.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/kmem_slab.h 2006-01-27 14:48:05.000000000 +0300 +@@ -0,0 +1,71 @@ ++#ifndef __KMEM_SLAB_H__ ++#define __KMEM_SLAB_H__ ++ ++/* ++ * kmem_bufctl_t: ++ * ++ * Bufctl's are used for linking objs within a slab ++ * linked offsets. ++ * ++ * This implementation relies on "struct page" for locating the cache & ++ * slab an object belongs to. 
++ * This allows the bufctl structure to be small (one int), but limits ++ * the number of objects a slab (not a cache) can contain when off-slab ++ * bufctls are used. The limit is the size of the largest general cache ++ * that does not use off-slab slabs. ++ * For 32bit archs with 4 kB pages, is this 56. ++ * This is not serious, as it is only for large objects, when it is unwise ++ * to have too many per slab. ++ * Note: This limit can be raised by introducing a general cache whose size ++ * is less than 512 (PAGE_SIZE<<3), but greater than 256. ++ */ ++ ++typedef unsigned int kmem_bufctl_t; ++#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) ++#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) ++#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) ++ ++/* ++ * struct slab ++ * ++ * Manages the objs in a slab. Placed either at the beginning of mem allocated ++ * for a slab, or allocated from an general cache. ++ * Slabs are chained into three list: fully used, partial, fully free slabs. ++ */ ++struct slab { ++ struct list_head list; ++ unsigned long colouroff; ++ void *s_mem; /* including colour offset */ ++ unsigned int inuse; /* num of objs active in slab */ ++ kmem_bufctl_t free; ++ unsigned short nodeid; ++}; ++ ++/* ++ * struct slab_rcu ++ * ++ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to ++ * arrange for kmem_freepages to be called via RCU. This is useful if ++ * we need to approach a kernel structure obliquely, from its address ++ * obtained without the usual locking. We can lock the structure to ++ * stabilize it and check it's still at the given address, only if we ++ * can be sure that the memory has not been meanwhile reused for some ++ * other kind of object (which our subsystem's lock might corrupt). ++ * ++ * rcu_read_lock before reading the address, then rcu_read_unlock after ++ * taking the spinlock within the structure expected at that address. ++ * ++ * We assume struct slab_rcu can overlay struct slab when destroying. ++ */ ++struct slab_rcu { ++ struct rcu_head head; ++ kmem_cache_t *cachep; ++ void *addr; ++}; ++ ++static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) ++{ ++ return (kmem_bufctl_t *)(slabp+1); ++} ++ ++#endif /* __KMEM_SLAB_H__ */ +diff -uprN linux-2.6.15.orig/include/linux/list.h linux-2.6.15-ve025stab014/include/linux/list.h +--- linux-2.6.15.orig/include/linux/list.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/list.h 2006-01-27 14:48:08.000000000 +0300 +@@ -409,6 +409,20 @@ static inline void list_splice_init(stru + pos = list_entry(pos->member.next, typeof(*pos), member)) + + /** ++ * list_for_each_entry_continue_reverse - iterate backwards over list of given ++ * type continuing after existing point ++ * @pos: the type * to use as a loop counter. ++ * @head: the head for your list. ++ * @member: the name of the list_struct within the struct. ++ */ ++#define list_for_each_entry_continue_reverse(pos, head, member) \ ++ for (pos = list_entry(pos->member.prev, typeof(*pos), member), \ ++ prefetch(pos->member.prev); \ ++ &pos->member != (head); \ ++ pos = list_entry(pos->member.prev, typeof(*pos), member), \ ++ prefetch(pos->member.prev)) ++ ++/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. 
+ * @n: another type * to use as temporary storage +diff -uprN linux-2.6.15.orig/include/linux/major.h linux-2.6.15-ve025stab014/include/linux/major.h +--- linux-2.6.15.orig/include/linux/major.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/major.h 2006-01-27 14:48:08.000000000 +0300 +@@ -165,4 +165,7 @@ + + #define VIOTAPE_MAJOR 230 + ++#define UNNAMED_EXTRA_MAJOR 130 ++#define UNNAMED_EXTRA_MAJOR_COUNT 120 ++ + #endif +diff -uprN linux-2.6.15.orig/include/linux/mm.h linux-2.6.15-ve025stab014/include/linux/mm.h +--- linux-2.6.15.orig/include/linux/mm.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/mm.h 2006-01-27 14:48:05.000000000 +0300 +@@ -39,6 +39,27 @@ extern int sysctl_legacy_va_layout; + + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) + ++#include <linux/mm_counter.h> ++ ++#ifdef CONFIG_USER_RESOURCE ++#define set_vma_rss(vma, v) set_mm_counter(vma, vm_rss, v) ++#define get_vma_rss(vma) get_mm_counter(vma, vm_rss) ++#define inc_vma_rss(vma) inc_mm_counter(vma, vm_rss) ++#define dec_vma_rss(vma) dec_mm_counter(vma, vm_rss) ++#define add_vma_rss(vma, v) add_mm_counter(vma, vm_rss, v) ++#define sub_vma_rss(vma, v) do { \ ++ if (unlikely(dec_mm_counter_chk(vma, vm_rss, v))) \ ++ warn_bad_rss(vma, v); \ ++ } while (0) ++#else ++#define set_vma_rss(vma, v) do { } while (0) ++#define get_vma_rss(vma) (0) ++#define inc_vma_rss(vma) do { } while (0) ++#define dec_vma_rss(vma) do { } while (0) ++#define add_vma_rss(vma, v) do { } while (0) ++#define sub_vma_rss(vma, v) do { } while (0) ++#endif ++ + /* + * Linux kernel virtual memory manager primitives. + * The idea being to have a "virtual" mm in the same way +@@ -109,6 +130,9 @@ struct vm_area_struct { + #ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ + #endif ++#ifdef CONFIG_USER_RESOURCE ++ mm_counter_t _vm_rss; ++#endif + }; + + /* +@@ -259,6 +283,12 @@ struct page { + void *virtual; /* Kernel virtual address (NULL if + not kmapped, ie. 
highmem) */ + #endif /* WANT_PAGE_VIRTUAL */ ++#ifdef CONFIG_USER_RESOURCE ++ union { ++ struct user_beancounter *page_ub; ++ struct page_beancounter *page_pb; ++ } bc; ++#endif + }; + + #define page_private(page) ((page)->u.private) +@@ -631,10 +661,8 @@ struct page *shmem_nopage(struct vm_area + int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); + struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + unsigned long addr); +-int shmem_lock(struct file *file, int lock, struct user_struct *user); + #else + #define shmem_nopage filemap_nopage +-#define shmem_lock(a, b, c) ({0;}) /* always in memory, no need to lock */ + #define shmem_set_policy(a, b) (0) + #define shmem_get_policy(a, b) (NULL) + #endif +@@ -677,7 +705,7 @@ void free_pgd_range(struct mmu_gather ** + void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); + int copy_page_range(struct mm_struct *dst, struct mm_struct *src, +- struct vm_area_struct *vma); ++ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); + int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, + unsigned long size, pgprot_t prot); + void unmap_mapping_range(struct address_space *mapping, +@@ -964,6 +992,7 @@ struct page *follow_page(struct vm_area_ + #define FOLL_TOUCH 0x02 /* mark page accessed */ + #define FOLL_GET 0x04 /* do get_page on page */ + #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ ++#define FOLL_KERN 0x10 /* lookup kernel page */ + + #ifdef CONFIG_PROC_FS + void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); +diff -uprN linux-2.6.15.orig/include/linux/mm_counter.h linux-2.6.15-ve025stab014/include/linux/mm_counter.h +--- linux-2.6.15.orig/include/linux/mm_counter.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/mm_counter.h 2006-01-27 14:48:05.000000000 +0300 +@@ -0,0 +1,44 @@ ++#ifndef __MM_COUNTER_H_ ++#define __MM_COUNTER_H_ ++#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS ++/* ++ * The mm counters are not protected by its page_table_lock, ++ * so must be incremented atomically. ++ */ ++#ifdef ATOMIC64_INIT ++#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value) ++#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member)) ++#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member) ++#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member) ++#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member) ++#define dec_mm_counter_chk(mm, member, value) atomic64_add_negative(-(value), &(mm)->_##member) ++typedef atomic64_t mm_counter_t; ++#else /* !ATOMIC64_INIT */ ++/* ++ * The counters wrap back to 0 at 2^32 * PAGE_SIZE, ++ * that is, at 16TB if using 4kB page size. ++ */ ++#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value) ++#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member)) ++#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member) ++#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member) ++#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member) ++#define dec_mm_counter_chk(mm, member, value) atomic_add_negative(-(value), &(mm)->_##member) ++typedef atomic_t mm_counter_t; ++#endif /* !ATOMIC64_INIT */ ++ ++#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++/* ++ * The mm counters are protected by its page_table_lock, ++ * so can be incremented directly. 
++ */ ++#define set_mm_counter(mm, member, value) (mm)->_##member = (value) ++#define get_mm_counter(mm, member) ((mm)->_##member) ++#define add_mm_counter(mm, member, value) (mm)->_##member += (value) ++#define inc_mm_counter(mm, member) (mm)->_##member++ ++#define dec_mm_counter(mm, member) (mm)->_##member-- ++#define dec_mm_counter_chk(mm, member, value) (((mm)->_##member -= (value)) < 0) ++typedef unsigned long mm_counter_t; ++ ++#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++#endif +diff -uprN linux-2.6.15.orig/include/linux/namei.h linux-2.6.15-ve025stab014/include/linux/namei.h +--- linux-2.6.15.orig/include/linux/namei.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/namei.h 2006-01-27 14:48:08.000000000 +0300 +@@ -48,12 +48,15 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA + #define LOOKUP_PARENT 16 + #define LOOKUP_NOALT 32 + #define LOOKUP_REVAL 64 ++#define LOOKUP_STRICT 128 /* no symlinks or other filesystems */ ++ + /* + * Intent data + */ + #define LOOKUP_OPEN (0x0100) + #define LOOKUP_CREATE (0x0200) + #define LOOKUP_ACCESS (0x0400) ++#define LOOKUP_NOAREACHECK (0x0800) /* no area check on lookup */ + + extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); + #define user_path_walk(name,nd) \ +diff -uprN linux-2.6.15.orig/include/linux/namespace.h linux-2.6.15-ve025stab014/include/linux/namespace.h +--- linux-2.6.15.orig/include/linux/namespace.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/namespace.h 2006-01-27 14:48:08.000000000 +0300 +@@ -13,6 +13,8 @@ struct namespace { + int event; + }; + ++extern struct rw_semaphore namespace_sem; ++ + extern int copy_namespace(int, struct task_struct *); + extern void __put_namespace(struct namespace *namespace); + +diff -uprN linux-2.6.15.orig/include/linux/netdevice.h linux-2.6.15-ve025stab014/include/linux/netdevice.h +--- linux-2.6.15.orig/include/linux/netdevice.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netdevice.h 2006-01-27 14:48:08.000000000 +0300 +@@ -37,6 +37,7 @@ + #include <linux/config.h> + #include <linux/device.h> + #include <linux/percpu.h> ++#include <linux/ctype.h> + + struct divert_blk; + struct vlan_group; +@@ -233,6 +234,11 @@ enum netdev_state_t + __LINK_STATE_LINKWATCH_PENDING + }; + ++struct netdev_bc { ++ struct user_beancounter *exec_ub, *owner_ub; ++}; ++ ++#define netdev_bc(dev) (&(dev)->dev_bc) + + /* + * This structure holds at boot time configured netdevice settings. 
They +@@ -309,6 +315,7 @@ struct net_device + #define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */ + #define NETIF_F_LLTX 4096 /* LockLess TX */ + #define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/ ++#define NETIF_F_VENET 0x80000000 /* Device is VENET device */ + + struct net_device *next_sched; + +@@ -431,6 +438,7 @@ struct net_device + enum { NETREG_UNINITIALIZED=0, + NETREG_REGISTERING, /* called register_netdevice */ + NETREG_REGISTERED, /* completed register todo */ ++ NETREG_REGISTER_ERR, /* register todo failed */ + NETREG_UNREGISTERING, /* called unregister_netdevice */ + NETREG_UNREGISTERED, /* completed unregister todo */ + NETREG_RELEASED, /* called free_netdev */ +@@ -500,8 +508,18 @@ struct net_device + struct divert_blk *divert; + #endif /* CONFIG_NET_DIVERT */ + ++ unsigned orig_mtu; /* MTU value before move to VE */ ++ struct ve_struct *owner_env; /* Owner VE of the interface */ ++ struct netdev_bc dev_bc; ++ + /* class/net/name entry */ + struct class_device class_dev; ++ ++#ifdef CONFIG_VE ++ /* List entry in global devices list to keep track of their names ++ * assignment */ ++ struct list_head dev_global_list_entry; ++#endif + }; + + #define NETDEV_ALIGN 32 +@@ -535,9 +553,22 @@ struct packet_type { + #include <linux/notifier.h> + + extern struct net_device loopback_dev; /* The loopback */ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define visible_loopback_dev (*get_exec_env()->_loopback_dev) ++#define dev_base (get_exec_env()->_net_dev_base) ++#define visible_dev_head(x) (&(x)->_net_dev_head) ++#define visible_dev_index_head(x) (&(x)->_net_dev_index_head) ++#else ++#define visible_loopback_dev loopback_dev + extern struct net_device *dev_base; /* All devices */ ++#define visible_dev_head(x) NULL ++#define visible_dev_index_head(x) NULL ++#endif + extern rwlock_t dev_base_lock; /* Device list lock */ + ++struct hlist_head *dev_name_hash(const char *name, struct ve_struct *env); ++struct hlist_head *dev_index_hash(int ifindex, struct ve_struct *env); ++ + extern int netdev_boot_setup_check(struct net_device *dev); + extern unsigned long netdev_boot_base(const char *prefix, int unit); + extern struct net_device *dev_getbyhwaddr(unsigned short type, char *hwaddr); +@@ -554,6 +585,7 @@ extern int dev_alloc_name(struct net_de + extern int dev_open(struct net_device *dev); + extern int dev_close(struct net_device *dev); + extern int dev_queue_xmit(struct sk_buff *skb); ++extern int dev_set_mtu(struct net_device *dev, int new_mtu); + extern int register_netdevice(struct net_device *dev); + extern int unregister_netdevice(struct net_device *dev); + extern void free_netdev(struct net_device *dev); +@@ -946,6 +978,29 @@ extern void dev_seq_stop(struct seq_file + + extern void linkwatch_run_queue(void); + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++static inline int ve_is_dev_movable(struct net_device *dev) ++{ ++ if (strcmp(dev->name, "lo") == 0) ++ return 0; ++ else if ((strncmp(dev->name, "venet", 5) == 0) && ++ (strlen(dev->name) > 5) && isdigit(dev->name[5])) ++ return 0; ++ else if ((strncmp(dev->name, "tun", 3) == 0) && ++ (strlen(dev->name) > 3) && isdigit(dev->name[3])) ++ return 0; ++ else if ((strncmp(dev->name, "tap", 3) == 0) && ++ (strlen(dev->name) > 3) && isdigit(dev->name[3])) ++ return 0; ++ return 1; ++} ++#else ++static inline int ve_is_dev_movable(struct net_device *dev) ++{ ++ return 0; ++} ++#endif ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_DEV_H */ +diff -uprN 
linux-2.6.15.orig/include/linux/netfilter/nf_conntrack_ftp.h linux-2.6.15-ve025stab014/include/linux/netfilter/nf_conntrack_ftp.h +--- linux-2.6.15.orig/include/linux/netfilter/nf_conntrack_ftp.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netfilter/nf_conntrack_ftp.h 2006-01-27 14:48:08.000000000 +0300 +@@ -32,13 +32,22 @@ struct ip_conntrack_expect; + + /* For NAT to hook in when we find a packet which describes what other + * connection we should expect. */ +-extern unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, ++typedef unsigned int (*ip_nat_helper_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp, + u32 *seq); ++extern ip_nat_helper_ftp_hook ip_nat_ftp_hook; ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_nat_ftp_hook \ ++ ((ip_nat_helper_ftp_hook) \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook)) ++#else ++#define ve_ip_nat_ftp_hook ip_nat_ftp_hook ++#endif + #endif /* __KERNEL__ */ + + #endif /* _NF_CONNTRACK_FTP_H */ +diff -uprN linux-2.6.15.orig/include/linux/netfilter.h linux-2.6.15-ve025stab014/include/linux/netfilter.h +--- linux-2.6.15.orig/include/linux/netfilter.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netfilter.h 2006-01-27 14:48:08.000000000 +0300 +@@ -39,6 +39,8 @@ + #define NFC_ALTERED 0x8000 + #endif + ++#define NFC_IPT_MASK (0x00FFFFFF) ++ + #ifdef __KERNEL__ + #include <linux/config.h> + #ifdef CONFIG_NETFILTER +@@ -107,12 +109,21 @@ struct nf_info + int nf_register_hook(struct nf_hook_ops *reg); + void nf_unregister_hook(struct nf_hook_ops *reg); + ++int virt_nf_register_hook(struct nf_hook_ops *reg); ++int virt_nf_unregister_hook(struct nf_hook_ops *reg); ++ + /* Functions to register get/setsockopt ranges (non-inclusive). You + need to check permissions yourself! */ + int nf_register_sockopt(struct nf_sockopt_ops *reg); + void nf_unregister_sockopt(struct nf_sockopt_ops *reg); + ++#ifdef CONFIG_VE_IPTABLES ++#define ve_nf_hooks \ ++ ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks)) ++#else + extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; ++#define ve_nf_hooks nf_hooks ++#endif + + /* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will + * disappear once iptables is replaced with pkttables. 
Please DO NOT use them +@@ -202,13 +213,13 @@ __ret;}) + #else + #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \ + ({int __ret; \ +-if (list_empty(&nf_hooks[pf][hook]) || \ ++if (list_empty(&ve_nf_hooks[pf][hook]) || \ + (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \ + __ret = (okfn)(skb); \ + __ret;}) + #define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh) \ + ({int __ret; \ +-if (list_empty(&nf_hooks[pf][hook]) || \ ++if (list_empty(&ve_nf_hooks[pf][hook]) || \ + (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1) \ + __ret = (okfn)(skb); \ + __ret;}) +diff -uprN linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack.h linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack.h +--- linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack.h 2006-01-27 14:48:08.000000000 +0300 +@@ -71,6 +71,11 @@ do { \ + + struct ip_conntrack_helper; + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/ve.h> ++#include <linux/ve_owner.h> ++#endif ++ + struct ip_conntrack + { + /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, +@@ -122,8 +127,15 @@ struct ip_conntrack + /* Traversed often, so hopefully in different cacheline to top */ + /* These are my tuples; original and reply */ + struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; ++#ifdef CONFIG_VE_IPTABLES ++ struct ve_struct *ct_owner_env; ++#endif + }; + ++#ifdef CONFIG_VE_IPTABLES ++DCL_VE_OWNER_PROTO(CT, struct ip_conntrack, ct_owner_env) ++#endif ++ + struct ip_conntrack_expect + { + /* Internal linked list (global expectation list) */ +@@ -235,7 +247,15 @@ extern void ip_conntrack_tcp_update(stru + enum ip_conntrack_dir dir); + + /* Call me when a conntrack is destroyed. 
*/ ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_conntrack_destroyed \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_destroyed) ++#else + extern void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack); ++#define ve_ip_conntrack_destroyed ip_conntrack_destroyed ++#endif ++ + + /* Fake conntrack entry for untracked connections */ + extern struct ip_conntrack ip_conntrack_untracked; +@@ -264,7 +284,7 @@ extern void ip_conntrack_proto_put(struc + extern void ip_ct_remove_expectations(struct ip_conntrack *ct); + + extern struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *, +- struct ip_conntrack_tuple *); ++ struct ip_conntrack_tuple *, struct user_beancounter *); + + extern void ip_conntrack_free(struct ip_conntrack *ct); + +@@ -294,6 +314,7 @@ static inline int is_dying(struct ip_con + } + + extern unsigned int ip_conntrack_htable_size; ++extern int ip_conntrack_disable_ve0; + + #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) + +diff -uprN linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack_core.h linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack_core.h +--- linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack_core.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack_core.h 2006-01-27 14:48:08.000000000 +0300 +@@ -3,7 +3,6 @@ + #include <linux/netfilter.h> + + #define MAX_IP_CT_PROTO 256 +-extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; + + /* This header is used to share core functionality between the + standalone connection tracking module, and the compatibility layer's use +@@ -54,8 +53,26 @@ static inline int ip_conntrack_confirm(s + + extern void ip_ct_unlink_expect(struct ip_conntrack_expect *exp); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_ct_protos \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_protos) ++#define ve_ip_conntrack_hash \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_hash) ++#define ve_ip_conntrack_expect_list \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_expect_list) ++#define ve_ip_conntrack_vmalloc \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_vmalloc) ++#else ++extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; + extern struct list_head *ip_conntrack_hash; + extern struct list_head ip_conntrack_expect_list; ++#define ve_ip_ct_protos ip_ct_protos ++#define ve_ip_conntrack_hash ip_conntrack_hash ++#define ve_ip_conntrack_expect_list ip_conntrack_expect_list ++#define ve_ip_conntrack_vmalloc ip_conntrack_vmalloc ++#endif /* CONFIG_VE_IPTABLES */ ++ + extern rwlock_t ip_conntrack_lock; + #endif /* _IP_CONNTRACK_CORE_H */ + +diff -uprN linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack_helper.h linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack_helper.h +--- linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2006-01-27 14:48:08.000000000 +0300 +@@ -31,6 +31,9 @@ struct ip_conntrack_helper + extern int ip_conntrack_helper_register(struct ip_conntrack_helper *); + extern void ip_conntrack_helper_unregister(struct ip_conntrack_helper *); + ++extern int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *); ++extern void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *); ++ + /* Allocate space for an expectation: this is mandatory before calling + 
ip_conntrack_expect_related. You will have to call put afterwards. */ + extern struct ip_conntrack_expect * +@@ -41,4 +44,5 @@ extern void ip_conntrack_expect_put(stru + extern int ip_conntrack_expect_related(struct ip_conntrack_expect *exp); + extern void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp); + ++extern struct list_head helpers; + #endif /*_IP_CONNTRACK_HELPER_H*/ +diff -uprN linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack_irc.h linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack_irc.h +--- linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2006-01-27 14:48:08.000000000 +0300 +@@ -14,16 +14,26 @@ + #ifndef _IP_CONNTRACK_IRC_H + #define _IP_CONNTRACK_IRC_H + ++#include <linux/netfilter_ipv4/ip_conntrack_helper.h> ++ + /* This structure exists only once per master */ + struct ip_ct_irc_master { + }; + + #ifdef __KERNEL__ +-extern unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, +- enum ip_conntrack_info ctinfo, +- unsigned int matchoff, +- unsigned int matchlen, +- struct ip_conntrack_expect *exp); ++typedef unsigned int (*ip_nat_helper_irc_hook)(struct sk_buff **, ++ enum ip_conntrack_info, unsigned int, unsigned int, ++ struct ip_conntrack_expect *); ++ ++extern ip_nat_helper_irc_hook ip_nat_irc_hook; ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_nat_irc_hook \ ++ ((ip_nat_helper_irc_hook) \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook)) ++#else ++#define ve_ip_nat_irc_hook ip_nat_irc_hook ++#endif + + #define IRC_PORT 6667 + +diff -uprN linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack_protocol.h linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack_protocol.h +--- linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2006-01-27 14:48:08.000000000 +0300 +@@ -67,6 +67,7 @@ struct ip_conntrack_protocol + /* Protocol registration. 
*/ + extern int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto); + extern void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto); ++ + /* Existing built-in protocols */ + extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp; + extern struct ip_conntrack_protocol ip_conntrack_protocol_udp; +@@ -74,6 +75,41 @@ extern struct ip_conntrack_protocol ip_c + extern struct ip_conntrack_protocol ip_conntrack_generic_protocol; + extern int ip_conntrack_protocol_tcp_init(void); + ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++#include <linux/sched.h> ++#define ve_ip_ct_tcp_timeouts \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeouts) ++#define ve_ip_ct_udp_timeout \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout) ++#define ve_ip_ct_udp_timeout_stream \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout_stream) ++#define ve_ip_ct_icmp_timeout \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_icmp_timeout) ++#define ve_ip_ct_generic_timeout \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_generic_timeout) ++#define ve_ip_ct_log_invalid \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_log_invalid) ++#define ve_ip_ct_tcp_timeout_max_retrans \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeout_max_retrans) ++#define ve_ip_ct_tcp_loose \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_loose) ++#define ve_ip_ct_tcp_be_liberal \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_be_liberal) ++#define ve_ip_ct_tcp_max_retrans \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_max_retrans) ++#else ++#define ve_ip_ct_tcp_timeouts *tcp_timeouts ++#define ve_ip_ct_udp_timeout ip_ct_udp_timeout ++#define ve_ip_ct_udp_timeout_stream ip_ct_udp_timeout_stream ++#define ve_ip_ct_icmp_timeout ip_ct_icmp_timeout ++#define ve_ip_ct_generic_timeout ip_ct_generic_timeout ++#define ve_ip_ct_log_invalid ip_ct_log_invalid ++#define ve_ip_ct_tcp_timeout_max_retrans ip_ct_tcp_timeout_max_retrans ++#define ve_ip_ct_tcp_loose ip_ct_tcp_loose ++#define ve_ip_ct_tcp_be_liberal ip_ct_tcp_be_liberal ++#define ve_ip_ct_tcp_max_retrans ip_ct_tcp_max_retrans ++#endif ++ + /* Log invalid packets */ + extern unsigned int ip_ct_log_invalid; + +@@ -85,10 +121,10 @@ extern int ip_ct_port_nfattr_to_tuple(st + #ifdef CONFIG_SYSCTL + #ifdef DEBUG_INVALID_PACKETS + #define LOG_INVALID(proto) \ +- (ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) ++ (ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW) + #else + #define LOG_INVALID(proto) \ +- ((ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) \ ++ ((ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW) \ + && net_ratelimit()) + #endif + #else +diff -uprN linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_nat_core.h linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_nat_core.h +--- linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_nat_core.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_nat_core.h 2006-01-27 14:48:08.000000000 +0300 +@@ -15,4 +15,5 @@ extern int ip_nat_icmp_reply_translation + struct ip_conntrack *ct, + enum ip_nat_manip_type manip, + enum ip_conntrack_dir dir); ++ + #endif /* _IP_NAT_CORE_H */ +diff -uprN linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_nat_rule.h linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_nat_rule.h +--- linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_nat_rule.h 2006-01-03 06:21:10.000000000 +0300 ++++ 
linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_nat_rule.h 2006-01-27 14:48:08.000000000 +0300 +@@ -6,7 +6,7 @@ + + #ifdef __KERNEL__ + +-extern int ip_nat_rule_init(void) __init; ++extern int ip_nat_rule_init(void); + extern void ip_nat_rule_cleanup(void); + extern int ip_nat_rule_find(struct sk_buff **pskb, + unsigned int hooknum, +diff -uprN linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_tables.h linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_tables.h +--- linux-2.6.15.orig/include/linux/netfilter_ipv4/ip_tables.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netfilter_ipv4/ip_tables.h 2006-01-27 14:48:08.000000000 +0300 +@@ -430,9 +430,15 @@ struct ipt_target + extern int ipt_register_target(struct ipt_target *target); + extern void ipt_unregister_target(struct ipt_target *target); + ++extern int virt_ipt_register_target(struct ipt_target *target); ++extern void virt_ipt_unregister_target(struct ipt_target *target); ++ + extern int ipt_register_match(struct ipt_match *match); + extern void ipt_unregister_match(struct ipt_match *match); + ++extern int virt_ipt_register_match(struct ipt_match *match); ++extern void virt_ipt_unregister_match(struct ipt_match *match); ++ + /* Furniture shopping... */ + struct ipt_table + { +@@ -486,6 +492,10 @@ extern unsigned int ipt_do_table(struct + struct ipt_table *table, + void *userdata); + ++extern struct ipt_table *virt_ipt_register_table(struct ipt_table *table, ++ const struct ipt_replace *repl); ++extern void virt_ipt_unregister_table(struct ipt_table *table); ++ + #define IPT_ALIGN(s) (((s) + (__alignof__(struct ipt_entry)-1)) & ~(__alignof__(struct ipt_entry)-1)) + #endif /*__KERNEL__*/ + #endif /* _IPTABLES_H */ +diff -uprN linux-2.6.15.orig/include/linux/netlink.h linux-2.6.15-ve025stab014/include/linux/netlink.h +--- linux-2.6.15.orig/include/linux/netlink.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/netlink.h 2006-01-27 14:48:05.000000000 +0300 +@@ -160,7 +160,8 @@ extern int netlink_unregister_notifier(s + + /* finegrained unicast helpers: */ + struct sock *netlink_getsockbyfilp(struct file *filp); +-int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo); ++int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, ++ long timeo, struct sock *ssk); + void netlink_detachskb(struct sock *sk, struct sk_buff *skb); + int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol); + +diff -uprN linux-2.6.15.orig/include/linux/nfcalls.h linux-2.6.15-ve025stab014/include/linux/nfcalls.h +--- linux-2.6.15.orig/include/linux/nfcalls.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/nfcalls.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,210 @@ ++/* ++ * include/linux/nfcalls.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _LINUX_NFCALLS_H ++#define _LINUX_NFCALLS_H ++ ++#include <linux/rcupdate.h> ++ ++extern struct module no_module; ++ ++#define DECL_KSYM_MODULE(name) \ ++ extern struct module *vz_mod_##name ++#define DECL_KSYM_CALL(type, name, args) \ ++ extern type (*vz_##name) args ++ ++#define INIT_KSYM_MODULE(name) \ ++ struct module *vz_mod_##name = &no_module; \ ++ EXPORT_SYMBOL(vz_mod_##name) ++#define INIT_KSYM_CALL(type, name, args) \ ++ type (*vz_##name) args; \ ++ EXPORT_SYMBOL(vz_##name) ++ ++#define __KSYMERRCALL(err, type, mod, name, args) \ ++({ \ ++ type ret = (type)err; \ ++ if (!__vzksym_module_get(vz_mod_##mod)) { \ ++ if (vz_##name) \ ++ ret = ((*vz_##name)args); \ ++ __vzksym_module_put(vz_mod_##mod); \ ++ } \ ++ ret; \ ++}) ++#define __KSYMSAFECALL_VOID(mod, name, args) \ ++do { \ ++ if (!__vzksym_module_get(vz_mod_##mod)) { \ ++ if (vz_##name) \ ++ ((*vz_##name)args); \ ++ __vzksym_module_put(vz_mod_##mod); \ ++ } \ ++} while (0) ++ ++#define KSYMERRCALL(err, mod, name, args) \ ++ __KSYMERRCALL(err, int, mod, name, args) ++#define KSYMSAFECALL(type, mod, name, args) \ ++ __KSYMERRCALL(0, type, mod, name, args) ++#define KSYMSAFECALL_VOID(mod, name, args) \ ++ __KSYMSAFECALL_VOID(mod, name, args) ++ ++#ifdef CONFIG_VE ++/* should be called _after_ KSYMRESOLVE's */ ++#define KSYMMODRESOLVE(name) \ ++ __vzksym_modresolve(&vz_mod_##name, THIS_MODULE) ++#define KSYMMODUNRESOLVE(name) \ ++ __vzksym_modunresolve(&vz_mod_##name) ++ ++#define KSYMRESOLVE(name) \ ++ vz_##name = &name ++#define KSYMUNRESOLVE(name) \ ++ vz_##name = NULL ++#else ++#define KSYMRESOLVE(name) do { } while (0) ++#define KSYMUNRESOLVE(name) do { } while (0) ++#define KSYMMODRESOLVE(name) do { } while (0) ++#define KSYMMODUNRESOLVE(name) do { } while (0) ++#endif ++ ++static inline void __vzksym_modresolve(struct module **modp, struct module *mod) ++{ ++ /* ++ * we want to be sure, that pointer updates are visible first: ++ * 1. wmb() is here only for piece of sure ++ * (note, no rmb() in KSYMSAFECALL) ++ * 2. synchronize_sched() guarantees that updates are visible ++ * on all cpus and allows us to remove rmb() in KSYMSAFECALL ++ */ ++ wmb(); synchronize_sched(); ++ *modp = mod; ++ /* just to be sure, our changes are visible as soon as possible */ ++ wmb(); synchronize_sched(); ++} ++ ++static inline void __vzksym_modunresolve(struct module **modp) ++{ ++ /* ++ * try_module_get() in KSYMSAFECALL should fail at this moment since ++ * THIS_MODULE in in unloading state (we should be called from fini), ++ * no need to syncronize pointers/ve_module updates. ++ */ ++ *modp = &no_module; ++ /* ++ * synchronize_sched() guarantees here that we see ++ * updated module pointer before the module really gets away ++ */ ++ synchronize_sched(); ++} ++ ++static inline int __vzksym_module_get(struct module *mod) ++{ ++ /* ++ * we want to avoid rmb(), so use synchronize_sched() in KSYMUNRESOLVE ++ * and smp_read_barrier_depends() here... 
++ */ ++ smp_read_barrier_depends(); /* for module loading */ ++ if (!try_module_get(mod)) ++ return -EBUSY; ++ ++ return 0; ++} ++ ++static inline void __vzksym_module_put(struct module *mod) ++{ ++ module_put(mod); ++} ++ ++#if defined(CONFIG_VE_IPTABLES) ++DECL_KSYM_MODULE(ip_tables); ++DECL_KSYM_MODULE(iptable_filter); ++DECL_KSYM_MODULE(iptable_mangle); ++DECL_KSYM_MODULE(ipt_limit); ++DECL_KSYM_MODULE(ipt_multiport); ++DECL_KSYM_MODULE(ipt_tos); ++DECL_KSYM_MODULE(ipt_TOS); ++DECL_KSYM_MODULE(ipt_REJECT); ++DECL_KSYM_MODULE(ipt_TCPMSS); ++DECL_KSYM_MODULE(ipt_tcpmss); ++DECL_KSYM_MODULE(ipt_ttl); ++DECL_KSYM_MODULE(ipt_LOG); ++DECL_KSYM_MODULE(ipt_length); ++DECL_KSYM_MODULE(ip_conntrack); ++DECL_KSYM_MODULE(ip_conntrack_ftp); ++DECL_KSYM_MODULE(ip_conntrack_irc); ++DECL_KSYM_MODULE(ipt_conntrack); ++DECL_KSYM_MODULE(ipt_state); ++DECL_KSYM_MODULE(ipt_helper); ++DECL_KSYM_MODULE(ip_nat); ++DECL_KSYM_MODULE(iptable_nat); ++DECL_KSYM_MODULE(ip_nat_ftp); ++DECL_KSYM_MODULE(ip_nat_irc); ++ ++struct sk_buff; ++ ++DECL_KSYM_CALL(int, init_netfilter, (void)); ++DECL_KSYM_CALL(int, init_iptables, (void)); ++DECL_KSYM_CALL(int, init_iptable_filter, (void)); ++DECL_KSYM_CALL(int, init_iptable_mangle, (void)); ++DECL_KSYM_CALL(int, init_iptable_limit, (void)); ++DECL_KSYM_CALL(int, init_iptable_multiport, (void)); ++DECL_KSYM_CALL(int, init_iptable_tos, (void)); ++DECL_KSYM_CALL(int, init_iptable_TOS, (void)); ++DECL_KSYM_CALL(int, init_iptable_REJECT, (void)); ++DECL_KSYM_CALL(int, init_iptable_TCPMSS, (void)); ++DECL_KSYM_CALL(int, init_iptable_tcpmss, (void)); ++DECL_KSYM_CALL(int, init_iptable_ttl, (void)); ++DECL_KSYM_CALL(int, init_iptable_LOG, (void)); ++DECL_KSYM_CALL(int, init_iptable_length, (void)); ++DECL_KSYM_CALL(int, init_iptable_conntrack, (void)); ++DECL_KSYM_CALL(int, init_iptable_ftp, (void)); ++DECL_KSYM_CALL(int, init_iptable_irc, (void)); ++DECL_KSYM_CALL(int, init_iptable_conntrack_match, (void)); ++DECL_KSYM_CALL(int, init_iptable_state, (void)); ++DECL_KSYM_CALL(int, init_iptable_helper, (void)); ++DECL_KSYM_CALL(int, ip_nat_init, (void)); ++DECL_KSYM_CALL(int, init_iptable_nat, (void)); ++DECL_KSYM_CALL(int, init_iptable_nat_ftp, (void)); ++DECL_KSYM_CALL(int, init_iptable_nat_irc, (void)); ++DECL_KSYM_CALL(void, fini_iptable_nat_irc, (void)); ++DECL_KSYM_CALL(void, fini_iptable_nat_ftp, (void)); ++DECL_KSYM_CALL(void, fini_iptable_nat, (void)); ++DECL_KSYM_CALL(void, ip_nat_cleanup, (void)); ++DECL_KSYM_CALL(void, fini_iptable_helper, (void)); ++DECL_KSYM_CALL(void, fini_iptable_state, (void)); ++DECL_KSYM_CALL(void, fini_iptable_conntrack_match, (void)); ++DECL_KSYM_CALL(void, fini_iptable_irc, (void)); ++DECL_KSYM_CALL(void, fini_iptable_ftp, (void)); ++DECL_KSYM_CALL(void, fini_iptable_conntrack, (void)); ++DECL_KSYM_CALL(void, fini_iptable_length, (void)); ++DECL_KSYM_CALL(void, fini_iptable_LOG, (void)); ++DECL_KSYM_CALL(void, fini_iptable_ttl, (void)); ++DECL_KSYM_CALL(void, fini_iptable_tcpmss, (void)); ++DECL_KSYM_CALL(void, fini_iptable_TCPMSS, (void)); ++DECL_KSYM_CALL(void, fini_iptable_REJECT, (void)); ++DECL_KSYM_CALL(void, fini_iptable_TOS, (void)); ++DECL_KSYM_CALL(void, fini_iptable_tos, (void)); ++DECL_KSYM_CALL(void, fini_iptable_multiport, (void)); ++DECL_KSYM_CALL(void, fini_iptable_limit, (void)); ++DECL_KSYM_CALL(void, fini_iptable_filter, (void)); ++DECL_KSYM_CALL(void, fini_iptable_mangle, (void)); ++DECL_KSYM_CALL(void, fini_iptables, (void)); ++DECL_KSYM_CALL(void, fini_netfilter, (void)); ++ ++DECL_KSYM_CALL(void, ipt_flush_table, (struct 
ipt_table *table)); ++#endif /* CONFIG_VE_IPTABLES */ ++ ++#ifdef CONFIG_VE_CALLS_MODULE ++DECL_KSYM_MODULE(vzmon); ++DECL_KSYM_CALL(int, real_get_device_perms_ve, ++ (int dev_type, dev_t dev, int access_mode)); ++DECL_KSYM_CALL(void, real_do_env_cleanup, (struct ve_struct *env)); ++DECL_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); ++DECL_KSYM_CALL(void, real_update_load_avg_ve, (void)); ++#endif ++ ++#endif /* _LINUX_NFCALLS_H */ +diff -uprN linux-2.6.15.orig/include/linux/notifier.h linux-2.6.15-ve025stab014/include/linux/notifier.h +--- linux-2.6.15.orig/include/linux/notifier.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/notifier.h 2006-01-27 14:48:06.000000000 +0300 +@@ -27,8 +27,9 @@ extern int notifier_call_chain(struct no + + #define NOTIFY_DONE 0x0000 /* Don't care */ + #define NOTIFY_OK 0x0001 /* Suits me */ ++#define NOTIFY_FAIL 0x0002 /* Reject */ + #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ +-#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) /* Bad/Veto action */ ++#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL) /* Bad/Veto action */ + /* + * Clean way to return from the notifier and stop further calls. + */ +diff -uprN linux-2.6.15.orig/include/linux/pid.h linux-2.6.15-ve025stab014/include/linux/pid.h +--- linux-2.6.15.orig/include/linux/pid.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/pid.h 2006-01-27 14:48:08.000000000 +0300 +@@ -1,6 +1,18 @@ + #ifndef _LINUX_PID_H + #define _LINUX_PID_H + ++#define VPID_BIT 10 ++#define VPID_DIV (1<<VPID_BIT) ++ ++#ifdef CONFIG_VE ++#define __is_virtual_pid(pid) ((pid) & VPID_DIV) ++#define is_virtual_pid(pid) \ ++ (__is_virtual_pid(pid) || ((pid)==1 && !ve_is_super(get_exec_env()))) ++#else ++#define __is_virtual_pid(pid) 0 ++#define is_virtual_pid(pid) 0 ++#endif ++ + enum pid_type + { + PIDTYPE_PID, +@@ -15,6 +27,9 @@ struct pid + /* Try to keep pid_chain in the same cacheline as nr for find_pid */ + int nr; + struct hlist_node pid_chain; ++#ifdef CONFIG_VE ++ int vnr; ++#endif + /* list of pids with the same nr, only one of them is in the hash */ + struct list_head pid_list; + }; +@@ -40,16 +55,89 @@ extern int alloc_pidmap(void); + extern void FASTCALL(free_pidmap(int)); + extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread); + +-#define do_each_task_pid(who, type, task) \ +- if ((task = find_task_by_pid_type(type, who))) { \ ++#ifndef CONFIG_VE ++ ++#define vpid_to_pid(pid) (pid) ++#define __vpid_to_pid(pid) (pid) ++#define pid_type_to_vpid(type, pid) (pid) ++#define __pid_type_to_vpid(type, pid) (pid) ++ ++#define comb_vpid_to_pid(pid) (pid) ++#define comb_pid_to_vpid(pid) (pid) ++ ++#else ++ ++struct ve_struct; ++extern void free_vpid(int vpid, struct ve_struct *ve); ++extern int alloc_vpid(int pid, int vpid); ++extern int vpid_to_pid(int pid); ++extern int __vpid_to_pid(int pid); ++extern pid_t pid_type_to_vpid(int type, pid_t pid); ++extern pid_t _pid_type_to_vpid(int type, pid_t pid); ++ ++static inline int comb_vpid_to_pid(int vpid) ++{ ++ int pid = vpid; ++ ++ if (vpid > 0) { ++ pid = vpid_to_pid(vpid); ++ if (unlikely(pid < 0)) ++ return 0; ++ } else if (vpid < 0) { ++ pid = vpid_to_pid(-vpid); ++ if (unlikely(pid < 0)) ++ return 0; ++ pid = -pid; ++ } ++ return pid; ++} ++ ++static inline int comb_pid_to_vpid(int pid) ++{ ++ int vpid = pid; ++ ++ if (pid > 0) { ++ vpid = pid_type_to_vpid(PIDTYPE_PID, pid); ++ if (unlikely(vpid < 0)) ++ return 0; ++ } else if (pid < 0) { ++ vpid = 
pid_type_to_vpid(PIDTYPE_PGID, -pid); ++ if (unlikely(vpid < 0)) ++ return 0; ++ vpid = -vpid; ++ } ++ return vpid; ++} ++#endif ++ ++#define do_each_task_pid_all(who, type, task) \ ++ if ((task = find_task_by_pid_type_all(type, who))) { \ + prefetch((task)->pids[type].pid_list.next); \ + do { + +-#define while_each_task_pid(who, type, task) \ ++#define while_each_task_pid_all(who, type, task) \ + } while (task = pid_task((task)->pids[type].pid_list.next,\ + type), \ + prefetch((task)->pids[type].pid_list.next), \ + hlist_unhashed(&(task)->pids[type].pid_chain)); \ + } \ + ++#ifndef CONFIG_VE ++#define __do_each_task_pid_ve(who, type, task, owner) \ ++ do_each_task_pid_all(who, type, task) ++#define __while_each_task_pid_ve(who, type, task, owner) \ ++ while_each_task_pid_all(who, type, task) ++#else /* CONFIG_VE */ ++#define __do_each_task_pid_ve(who, type, task, owner) \ ++ do_each_task_pid_all(who, type, task) \ ++ if (ve_accessible(VE_TASK_INFO(task)->owner_env, owner)) ++#define __while_each_task_pid_ve(who, type, task, owner) \ ++ while_each_task_pid_all(who, type, task) ++#endif /* CONFIG_VE */ ++ ++#define do_each_task_pid_ve(who, type, task) \ ++ __do_each_task_pid_ve(who, type, task, get_exec_env()); ++#define while_each_task_pid_ve(who, type, task) \ ++ __while_each_task_pid_ve(who, type, task, get_exec_env()); ++ + #endif /* _LINUX_PID_H */ +diff -uprN linux-2.6.15.orig/include/linux/proc_fs.h linux-2.6.15-ve025stab014/include/linux/proc_fs.h +--- linux-2.6.15.orig/include/linux/proc_fs.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/proc_fs.h 2006-01-27 14:48:08.000000000 +0300 +@@ -86,8 +86,14 @@ struct vmcore { + + extern struct proc_dir_entry proc_root; + extern struct proc_dir_entry *proc_root_fs; ++#ifdef CONFIG_VE ++#include <linux/sched.h> ++#define proc_net (get_exec_env()->_proc_net) ++#define proc_net_stat (get_exec_env()->_proc_net_stat) ++#else + extern struct proc_dir_entry *proc_net; + extern struct proc_dir_entry *proc_net_stat; ++#endif + extern struct proc_dir_entry *proc_bus; + extern struct proc_dir_entry *proc_root_driver; + extern struct proc_dir_entry *proc_root_kcore; +@@ -98,8 +104,8 @@ extern void proc_misc_init(void); + struct mm_struct; + + struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +-struct dentry *proc_pid_unhash(struct task_struct *p); +-void proc_pid_flush(struct dentry *proc_dentry); ++void proc_pid_unhash(struct task_struct *p, struct dentry * [2]); ++void proc_pid_flush(struct dentry *proc_dentry[2]); + int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); + unsigned long task_vsize(struct mm_struct *); + int task_statm(struct mm_struct *, int *, int *, int *, int *); +@@ -107,7 +113,11 @@ char *task_mem(struct mm_struct *, char + + extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, + struct proc_dir_entry *parent); ++extern struct proc_dir_entry *create_proc_glob_entry(const char *name, ++ mode_t mode, ++ struct proc_dir_entry *parent); + extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); ++extern void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent); + + extern struct vfsmount *proc_mnt; + extern int proc_fill_super(struct super_block *,void *,int); +@@ -189,6 +199,15 @@ static inline struct proc_dir_entry *pro + return res; + } + ++static inline struct proc_dir_entry *proc_glob_fops_create(const char *name, ++ mode_t mode, struct file_operations *fops) 
++{ ++ struct proc_dir_entry *res = create_proc_glob_entry(name, mode, NULL); ++ if (res) ++ res->proc_fops = fops; ++ return res; ++} ++ + static inline void proc_net_remove(const char *name) + { + remove_proc_entry(name,proc_net); +@@ -201,16 +220,21 @@ static inline void proc_net_remove(const + #define proc_bus NULL + + #define proc_net_fops_create(name, mode, fops) ({ (void)(mode), NULL; }) ++#define proc_glob_fops_create(name, mode, fops) ({ (void)(mode), NULL; }) + #define proc_net_create(name, mode, info) ({ (void)(mode), NULL; }) + static inline void proc_net_remove(const char *name) {} + +-static inline struct dentry *proc_pid_unhash(struct task_struct *p) { return NULL; } +-static inline void proc_pid_flush(struct dentry *proc_dentry) { } ++static inline struct dentry *proc_pid_unhash(struct task_struct *p, ++ struct dentry *d[2]) { return NULL; } ++static inline void proc_pid_flush(struct dentry *proc_dentry[2]) { } + + static inline struct proc_dir_entry *create_proc_entry(const char *name, + mode_t mode, struct proc_dir_entry *parent) { return NULL; } ++static inline struct proc_dir_entry *create_proc_glob_entry(const char *name, ++ mode_t mode, struct proc_dir_entry *parent) { return NULL; } + + #define remove_proc_entry(name, parent) do {} while (0) ++#define remove_proc_glob_entry(name, parent) do {} while (0) + + static inline struct proc_dir_entry *proc_symlink(const char *name, + struct proc_dir_entry *parent,const char *dest) {return NULL;} +@@ -261,4 +285,18 @@ static inline struct proc_dir_entry *PDE + return PROC_I(inode)->pde; + } + ++static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) ++{ ++ if (de) ++ atomic_inc(&de->count); ++ return de; ++} ++ ++extern void de_put(struct proc_dir_entry *); ++ ++#define LPDE(inode) (PROC_I((inode))->pde) ++#ifdef CONFIG_VE ++#define GPDE(inode) (*(struct proc_dir_entry **)(&(inode)->i_pipe)) ++#endif ++ + #endif /* _LINUX_PROC_FS_H */ +diff -uprN linux-2.6.15.orig/include/linux/quota.h linux-2.6.15-ve025stab014/include/linux/quota.h +--- linux-2.6.15.orig/include/linux/quota.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/quota.h 2006-01-27 14:48:09.000000000 +0300 +@@ -37,7 +37,6 @@ + + #include <linux/errno.h> + #include <linux/types.h> +-#include <linux/spinlock.h> + + #define __DQUOT_VERSION__ "dquot_6.5.1" + #define __DQUOT_NUM_VERSION__ 6*10000+5*100+1 +@@ -45,8 +44,6 @@ + typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ + typedef __u64 qsize_t; /* Type in which we store sizes */ + +-extern spinlock_t dq_data_lock; +- + /* Size of blocks in which are counted size limits */ + #define QUOTABLOCK_BITS 10 + #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) +@@ -133,6 +130,10 @@ struct if_dqinfo { + + #ifdef __KERNEL__ + ++#include <linux/spinlock.h> ++ ++extern spinlock_t dq_data_lock; ++ + #include <linux/dqblk_xfs.h> + #include <linux/dqblk_v1.h> + #include <linux/dqblk_v2.h> +@@ -242,6 +243,8 @@ struct quota_format_ops { + int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ + }; + ++struct inode; ++struct iattr; + /* Operations working with dquots */ + struct dquot_operations { + int (*initialize) (struct inode *, int); +@@ -256,9 +259,11 @@ struct dquot_operations { + int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ + int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ + int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ 
++ int (*rename) (struct inode *, struct inode *, struct inode *); + }; + + /* Operations handling requests from userspace */ ++struct v2_disk_dqblk; + struct quotactl_ops { + int (*quota_on)(struct super_block *, int, int, char *); + int (*quota_off)(struct super_block *, int); +@@ -271,6 +276,9 @@ struct quotactl_ops { + int (*set_xstate)(struct super_block *, unsigned int, int); + int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); + int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); ++#ifdef CONFIG_QUOTA_COMPAT ++ int (*get_quoti)(struct super_block *, int, unsigned int, struct v2_disk_dqblk *); ++#endif + }; + + struct quota_format_type { +@@ -291,6 +299,10 @@ struct quota_info { + struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ + struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ + struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++ struct vz_quota_master *vzdq_master; ++ int vzdq_count; ++#endif + }; + + /* Inline would be better but we need to dereference super_block which is not defined yet */ +diff -uprN linux-2.6.15.orig/include/linux/quotaops.h linux-2.6.15-ve025stab014/include/linux/quotaops.h +--- linux-2.6.15.orig/include/linux/quotaops.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/quotaops.h 2006-01-27 14:48:08.000000000 +0300 +@@ -171,6 +171,19 @@ static __inline__ int DQUOT_TRANSFER(str + return 0; + } + ++static __inline__ int DQUOT_RENAME(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir) ++{ ++ struct dquot_operations *q_op; ++ ++ q_op = inode->i_sb->dq_op; ++ if (q_op && q_op->rename) { ++ if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA) ++ return 1; ++ } ++ return 0; ++} ++ + /* The following two functions cannot be called inside a transaction */ + #define DQUOT_SYNC(sb) sync_dquots(sb, -1) + +@@ -198,6 +211,7 @@ static __inline__ int DQUOT_OFF(struct s + #define DQUOT_SYNC(sb) do { } while(0) + #define DQUOT_OFF(sb) do { } while(0) + #define DQUOT_TRANSFER(inode, iattr) (0) ++#define DQUOT_RENAME(inode, old_dir, new_dir) (0) + static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) + { + inode_add_bytes(inode, nr); +diff -uprN linux-2.6.15.orig/include/linux/sched.h linux-2.6.15-ve025stab014/include/linux/sched.h +--- linux-2.6.15.orig/include/linux/sched.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/sched.h 2006-01-27 14:48:09.000000000 +0300 +@@ -37,7 +37,10 @@ + + #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */ + ++#include <ub/ub_task.h> ++ + struct exec_domain; ++struct ve_struct; + + /* + * cloning flags: +@@ -91,15 +94,34 @@ extern unsigned long avenrun[]; /* Load + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + ++#define LOAD_INT(x) ((x) >> FSHIFT) ++#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) ++ + extern unsigned long total_forks; + extern int nr_threads; + extern int last_pid; + DECLARE_PER_CPU(unsigned long, process_counts); + extern int nr_processes(void); ++ ++extern unsigned long nr_sleeping(void); ++extern unsigned long nr_stopped(void); ++extern unsigned long nr_zombie; ++extern unsigned long nr_dead; + extern unsigned long nr_running(void); + extern unsigned long nr_uninterruptible(void); + extern unsigned long nr_iowait(void); + ++#ifdef CONFIG_VE ++struct ve_struct; ++extern unsigned long nr_running_ve(struct ve_struct *); ++extern unsigned 
long nr_iowait_ve(struct ve_struct *); ++extern unsigned long nr_uninterruptible_ve(struct ve_struct *); ++#else ++#define nr_running_ve(ve) 0 ++#define nr_iowait_ve(ve) 0 ++#define nr_uninterruptible_ve(ve) 0 ++#endif ++ + #include <linux/time.h> + #include <linux/param.h> + #include <linux/resource.h> +@@ -249,44 +271,7 @@ arch_get_unmapped_area_topdown(struct fi + extern void arch_unmap_area(struct mm_struct *, unsigned long); + extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); + +-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +-/* +- * The mm counters are not protected by its page_table_lock, +- * so must be incremented atomically. +- */ +-#ifdef ATOMIC64_INIT +-#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value) +-#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member)) +-#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member) +-#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member) +-#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member) +-typedef atomic64_t mm_counter_t; +-#else /* !ATOMIC64_INIT */ +-/* +- * The counters wrap back to 0 at 2^32 * PAGE_SIZE, +- * that is, at 16TB if using 4kB page size. +- */ +-#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value) +-#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member)) +-#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member) +-#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member) +-#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member) +-typedef atomic_t mm_counter_t; +-#endif /* !ATOMIC64_INIT */ +- +-#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +-/* +- * The mm counters are protected by its page_table_lock, +- * so can be incremented directly. +- */ +-#define set_mm_counter(mm, member, value) (mm)->_##member = (value) +-#define get_mm_counter(mm, member) ((mm)->_##member) +-#define add_mm_counter(mm, member, value) (mm)->_##member += (value) +-#define inc_mm_counter(mm, member) (mm)->_##member++ +-#define dec_mm_counter(mm, member) (mm)->_##member-- +-typedef unsigned long mm_counter_t; +- +-#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++#include <linux/mm_counter.h> + + #define get_mm_rss(mm) \ + (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) +@@ -341,6 +326,7 @@ struct mm_struct { + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + + unsigned dumpable:2; ++ unsigned vps_dumpable:1; + cpumask_t cpu_vm_mask; + + /* Architecture-specific MM context */ +@@ -357,6 +343,8 @@ struct mm_struct { + /* aio bits */ + rwlock_t ioctx_list_lock; + struct kioctx *ioctx_list; ++ ++ struct user_beancounter *mm_ub; + }; + + struct sighand_struct { +@@ -365,6 +353,9 @@ struct sighand_struct { + spinlock_t siglock; + }; + ++#include <linux/ve.h> ++#include <linux/ve_task.h> ++ + /* + * NOTE! 
"signal_struct" does not have it's own + * locking, because a shared signal_struct always +@@ -857,6 +848,16 @@ struct task_struct { + int cpuset_mems_generation; + #endif + atomic_t fs_excl; /* holding fs exclusive resources */ ++#ifdef CONFIG_USER_RESOURCE ++ struct task_beancounter task_bc; ++#endif ++#ifdef CONFIG_VE ++ struct ve_task_info ve_task_info; ++#endif ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++ unsigned long magic; ++ struct inode *ino; ++#endif + }; + + static inline pid_t process_group(struct task_struct *tsk) +@@ -948,6 +949,21 @@ static inline int set_cpus_allowed(task_ + extern unsigned long long sched_clock(void); + extern unsigned long long current_sched_time(const task_t *current_task); + ++static inline unsigned long cycles_to_clocks(cycles_t cycles) ++{ ++ extern unsigned long cycles_per_clock; ++ do_div(cycles, cycles_per_clock); ++ return cycles; ++} ++ ++static inline u64 cycles_to_jiffies(cycles_t cycles) ++{ ++ extern unsigned long cycles_per_jiffy; ++ do_div(cycles, cycles_per_jiffy); ++ return cycles; ++} ++ ++ + /* sched_exec is called by processes performing an exec */ + #ifdef CONFIG_SMP + extern void sched_exec(void); +@@ -1000,12 +1016,227 @@ extern struct task_struct init_task; + + extern struct mm_struct init_mm; + +-#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) +-extern struct task_struct *find_task_by_pid_type(int type, int pid); ++#define find_task_by_pid_all(nr) \ ++ find_task_by_pid_type_all(PIDTYPE_PID, nr) ++extern struct task_struct *find_task_by_pid_type_all(int type, int pid); + extern void set_special_pids(pid_t session, pid_t pgrp); + extern void __set_special_pids(pid_t session, pid_t pgrp); + ++#ifndef CONFIG_VE ++#define find_task_by_pid_ve find_task_by_pid_all ++ ++#define get_exec_env() ((struct ve_struct *)NULL) ++#define set_exec_env(new_env) ((struct ve_struct *)NULL) ++ ++#define ve_is_super(env) 1 ++#define ve_accessible(target, owner) 1 ++#define ve_accessible_strict(target, owner) 1 ++#define ve_accessible_veid(target, owner) 1 ++#define ve_accessible_strict_veid(target, owner) 1 ++ ++#define VEID(envid) 0 ++#define get_ve0() NULL ++ ++static inline pid_t virt_pid(struct task_struct *tsk) ++{ ++ return tsk->pid; ++} ++ ++static inline pid_t virt_tgid(struct task_struct *tsk) ++{ ++ return tsk->tgid; ++} ++ ++static inline pid_t virt_pgid(struct task_struct *tsk) ++{ ++ return tsk->signal->pgrp; ++} ++ ++static inline pid_t virt_sid(struct task_struct *tsk) ++{ ++ return tsk->signal->session; ++} ++ ++#define get_task_pid_ve(tsk, ve) get_task_pid(tsk) ++ ++static inline pid_t get_task_pid(struct task_struct *tsk) ++{ ++ return tsk->pid; ++} ++ ++static inline pid_t get_task_tgid(struct task_struct *tsk) ++{ ++ return tsk->tgid; ++} ++ ++static inline pid_t get_task_pgid(struct task_struct *tsk) ++{ ++ return tsk->signal->pgrp; ++} ++ ++static inline pid_t get_task_sid(struct task_struct *tsk) ++{ ++ return tsk->signal->session; ++} ++ ++static inline void set_virt_pid(struct task_struct *tsk, pid_t pid) ++{ ++} ++ ++static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid) ++{ ++} ++ ++static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid) ++{ ++} ++ ++static inline void set_virt_sid(struct task_struct *tsk, pid_t pid) ++{ ++} ++ ++static inline pid_t get_task_ppid(struct task_struct *p) ++{ ++ return pid_alive(p) ? 
p->group_leader->real_parent->tgid : 0; ++} ++ ++#else /* CONFIG_VE */ ++ ++#include <asm/current.h> ++#include <linux/ve.h> ++ ++extern struct ve_struct ve0; ++ ++#define find_task_by_pid_ve(nr) \ ++ find_task_by_pid_type_ve(PIDTYPE_PID, nr) ++ ++extern struct task_struct *find_task_by_pid_type_ve(int type, int pid); ++ ++#define get_ve0() (&ve0) ++#define VEID(envid) ((envid)->veid) ++ ++#define get_exec_env() (VE_TASK_INFO(current)->exec_env) ++static inline struct ve_struct *set_exec_env(struct ve_struct *new_env) ++{ ++ struct ve_struct *old_env; ++ ++ old_env = VE_TASK_INFO(current)->exec_env; ++ VE_TASK_INFO(current)->exec_env = new_env; ++ ++ return old_env; ++} ++ ++#define ve_is_super(env) ((env) == get_ve0()) ++#define ve_accessible_strict(target, owner) ((target) == (owner)) ++static inline int ve_accessible(struct ve_struct *target, ++ struct ve_struct *owner) { ++ return ve_is_super(owner) || ve_accessible_strict(target, owner); ++} ++ ++#define ve_accessible_strict_veid(target, owner) ((target) == (owner)) ++static inline int ve_accessible_veid(envid_t target, envid_t owner) ++{ ++ return get_ve0()->veid == owner || ++ ve_accessible_strict_veid(target, owner); ++} ++ ++static inline pid_t virt_pid(struct task_struct *tsk) ++{ ++ return tsk->pids[PIDTYPE_PID].vnr; ++} ++ ++static inline pid_t virt_tgid(struct task_struct *tsk) ++{ ++ return tsk->pids[PIDTYPE_TGID].vnr; ++} ++ ++static inline pid_t virt_pgid(struct task_struct *tsk) ++{ ++ return tsk->pids[PIDTYPE_PGID].vnr; ++} ++ ++static inline pid_t virt_sid(struct task_struct *tsk) ++{ ++ return tsk->pids[PIDTYPE_SID].vnr; ++} ++ ++static inline pid_t get_task_pid_ve(struct task_struct *tsk, struct ve_struct *env) ++{ ++ return ve_is_super(env) ? tsk->pid : virt_pid(tsk); ++} ++ ++static inline pid_t get_task_pid(struct task_struct *tsk) ++{ ++ return get_task_pid_ve(tsk, get_exec_env()); ++} ++ ++static inline pid_t get_task_tgid(struct task_struct *tsk) ++{ ++ return ve_is_super(get_exec_env()) ? tsk->tgid : virt_tgid(tsk); ++} ++ ++static inline pid_t get_task_pgid(struct task_struct *tsk) ++{ ++ return ve_is_super(get_exec_env()) ? tsk->signal->pgrp : virt_pgid(tsk); ++} ++ ++static inline pid_t get_task_sid(struct task_struct *tsk) ++{ ++ return ve_is_super(get_exec_env()) ? tsk->signal->session : virt_sid(tsk); ++} ++ ++static inline void set_virt_pid(struct task_struct *tsk, pid_t pid) ++{ ++ tsk->pids[PIDTYPE_PID].vnr = pid; ++} ++ ++static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid) ++{ ++ tsk->pids[PIDTYPE_TGID].vnr = pid; ++} ++ ++static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid) ++{ ++ tsk->pids[PIDTYPE_PGID].vnr = pid; ++} ++ ++static inline void set_virt_sid(struct task_struct *tsk, pid_t pid) ++{ ++ tsk->pids[PIDTYPE_SID].vnr = pid; ++} ++ ++static inline pid_t get_task_ppid(struct task_struct *p) ++{ ++ struct task_struct *parent; ++ struct ve_struct *env; ++ ++ if (!pid_alive(p)) ++ return 0; ++ env = get_exec_env(); ++ if (get_task_pid_ve(p, env) == 1) ++ return 0; ++ parent = p->group_leader->real_parent; ++ return ve_accessible(VE_TASK_INFO(parent)->owner_env, env) ? 
++ get_task_tgid(parent) : 1; ++} ++ ++void ve_sched_get_cpu_stat(struct ve_struct *envid, cycles_t *idle, ++ cycles_t *strv, unsigned int cpu); ++void ve_sched_attach(struct ve_struct *envid); ++ ++#endif /* CONFIG_VE */ ++ ++ ++#ifdef CONFIG_VE ++extern cycles_t ve_sched_get_idle_time(struct ve_struct *, int); ++extern cycles_t ve_sched_get_iowait_time(struct ve_struct *, int); ++#else ++#define ve_sched_get_idle_time(ve, cpu) 0 ++#define ve_sched_get_iowait_time(ve, cpu) 0 ++#endif ++ + /* per-UID process charging. */ ++extern int set_user(uid_t new_ruid, int dumpclear); + extern struct user_struct * alloc_uid(uid_t); + static inline struct user_struct *get_uid(struct user_struct *u) + { +@@ -1182,22 +1413,100 @@ extern void wait_task_inactive(task_t * + add_parent(p, (p)->parent); \ + } while (0) + +-#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks) +-#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks) ++#define next_task_all(p) list_entry((p)->tasks.next, struct task_struct, tasks) ++#define prev_task_all(p) list_entry((p)->tasks.prev, struct task_struct, tasks) + +-#define for_each_process(p) \ +- for (p = &init_task ; (p = next_task(p)) != &init_task ; ) ++#define for_each_process_all(p) \ ++ for (p = &init_task ; (p = next_task_all(p)) != &init_task ; ) + + /* + * Careful: do_each_thread/while_each_thread is a double loop so + * 'break' will not work as expected - use goto instead. + */ +-#define do_each_thread(g, t) \ +- for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do ++#define do_each_thread_all(g, t) \ ++ for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do ++ ++#define while_each_thread_all(g, t) \ ++ while ((t = next_thread(t)) != g) ++ ++#ifndef CONFIG_VE ++ ++#define SET_VE_LINKS(p) ++#define REMOVE_VE_LINKS(p) ++#define for_each_process_ve(p) for_each_process_all(p) ++#define do_each_thread_ve(g, t) do_each_thread_all(g, t) ++#define while_each_thread_ve(g, t) while_each_thread_all(g, t) ++#define first_task_ve() next_task_ve(&init_task) ++#define __first_task_ve(owner) next_task_ve(&init_task) ++#define __next_task_ve(owner, p) next_task_ve(p) ++#define next_task_ve(p) \ ++ (next_task_all(p) != &init_task ? 
next_task_all(p) : NULL) ++ ++#else /* CONFIG_VE */ ++ ++#define SET_VE_LINKS(p) \ ++ do { \ ++ if (thread_group_leader(p)) \ ++ list_add_tail(&VE_TASK_INFO(p)->vetask_list, \ ++ &VE_TASK_INFO(p)->owner_env->vetask_lh); \ ++ } while (0) ++ ++#define REMOVE_VE_LINKS(p) \ ++ do { \ ++ if (thread_group_leader(p)) \ ++ list_del(&VE_TASK_INFO(p)->vetask_list); \ ++ } while(0) ++ ++static inline task_t* __first_task_ve(struct ve_struct *ve) ++{ ++ task_t *tsk; ++ ++ if (unlikely(ve_is_super(ve))) { ++ tsk = next_task_all(&init_task); ++ if (tsk == &init_task) ++ tsk = NULL; ++ } else { ++ /* probably can return ve->init_entry, but it's more clear */ ++ BUG_ON(list_empty(&ve->vetask_lh)); ++ tsk = VE_TASK_LIST_2_TASK(ve->vetask_lh.next); ++ } ++ return tsk; ++} ++ ++static inline task_t* __next_task_ve(struct ve_struct *ve, task_t *tsk) ++{ ++ if (unlikely(ve_is_super(ve))) { ++ tsk = next_task_all(tsk); ++ if (tsk == &init_task) ++ tsk = NULL; ++ } else { ++ struct list_head *tmp; ++ ++ BUG_ON(VE_TASK_INFO(tsk)->owner_env != ve); ++ tmp = VE_TASK_INFO(tsk)->vetask_list.next; ++ if (tmp == &ve->vetask_lh) ++ tsk = NULL; ++ else ++ tsk = VE_TASK_LIST_2_TASK(tmp); ++ } ++ return tsk; ++} + +-#define while_each_thread(g, t) \ ++#define first_task_ve() __first_task_ve(get_exec_env()) ++#define next_task_ve(p) __next_task_ve(get_exec_env(), p) ++/* no one uses prev_task_ve(), copy next_task_ve() if needed */ ++ ++#define for_each_process_ve(p) \ ++ for (p = first_task_ve(); p != NULL ; p = next_task_ve(p)) ++ ++#define do_each_thread_ve(g, t) \ ++ for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do ++ ++#define while_each_thread_ve(g, t) \ + while ((t = next_thread(t)) != g) + ++#endif /* CONFIG_VE */ ++ + extern task_t * FASTCALL(next_thread(const task_t *p)); + + #define thread_group_leader(p) (p->pid == p->tgid) +diff -uprN linux-2.6.15.orig/include/linux/shm.h linux-2.6.15-ve025stab014/include/linux/shm.h +--- linux-2.6.15.orig/include/linux/shm.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/shm.h 2006-01-27 14:48:08.000000000 +0300 +@@ -86,6 +86,7 @@ struct shmid_kernel /* private to the ke + pid_t shm_cprid; + pid_t shm_lprid; + struct user_struct *mlock_user; ++ struct ipc_ids *_shm_ids; + }; + + /* shm_mode upper byte flags */ +diff -uprN linux-2.6.15.orig/include/linux/shmem_fs.h linux-2.6.15-ve025stab014/include/linux/shmem_fs.h +--- linux-2.6.15.orig/include/linux/shmem_fs.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/shmem_fs.h 2006-01-27 14:48:06.000000000 +0300 +@@ -19,6 +19,9 @@ struct shmem_inode_info { + swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ + struct list_head swaplist; /* chain of maybes on swap */ + struct inode vfs_inode; ++#ifdef CONFIG_USER_RESOURCE ++ struct user_beancounter *shmi_ub; ++#endif + }; + + struct shmem_sb_info { +diff -uprN linux-2.6.15.orig/include/linux/signal.h linux-2.6.15-ve025stab014/include/linux/signal.h +--- linux-2.6.15.orig/include/linux/signal.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/signal.h 2006-01-27 14:48:06.000000000 +0300 +@@ -28,6 +28,9 @@ struct sigqueue { + int flags; + siginfo_t info; + struct user_struct *user; ++#ifdef CONFIG_USER_RESOURCE ++ struct user_beancounter *sig_ub; ++#endif + }; + + /* flags values. 
*/ +diff -uprN linux-2.6.15.orig/include/linux/skbuff.h linux-2.6.15-ve025stab014/include/linux/skbuff.h +--- linux-2.6.15.orig/include/linux/skbuff.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/skbuff.h 2006-01-27 14:48:08.000000000 +0300 +@@ -19,6 +19,7 @@ + #include <linux/compiler.h> + #include <linux/time.h> + #include <linux/cache.h> ++#include <linux/ve_owner.h> + + #include <asm/atomic.h> + #include <asm/types.h> +@@ -212,6 +213,8 @@ enum { + * @tc_verd: traffic control verdict + */ + ++#include <ub/ub_sk.h> ++ + struct sk_buff { + /* These two members must be first. */ + struct sk_buff *next; +@@ -295,13 +298,18 @@ struct sk_buff { + *data, + *tail, + *end; ++ struct skb_beancounter skb_bc; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(SKB, struct sk_buff, owner_env) ++ + #ifdef __KERNEL__ + /* + * Handling routines are only of interest to the kernel + */ + #include <linux/slab.h> ++#include <ub/ub_net.h> + + #include <asm/system.h> + +diff -uprN linux-2.6.15.orig/include/linux/slab.h linux-2.6.15-ve025stab014/include/linux/slab.h +--- linux-2.6.15.orig/include/linux/slab.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/slab.h 2006-01-27 14:48:06.000000000 +0300 +@@ -48,6 +48,26 @@ typedef struct kmem_cache kmem_cache_t; + #define SLAB_PANIC 0x00040000UL /* panic if kmem_cache_create() fails */ + #define SLAB_DESTROY_BY_RCU 0x00080000UL /* defer freeing pages to RCU */ + ++/* ++ * allocation rules: __GFP_UBC 0 ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * cache (SLAB_UBC) charge charge ++ * (usual caches: mm, vma, task_struct, ...) ++ * ++ * cache (SLAB_UBC | SLAB_NO_CHARGE) charge --- ++ * (ub_kmalloc) (kmalloc) ++ * ++ * cache (no UB flags) BUG() --- ++ * (nonub caches, mempools) ++ * ++ * pages charge --- ++ * (ub_vmalloc, (vmalloc, ++ * poll, fdsets, ...) non-ub allocs) ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ */ ++#define SLAB_UBC 0x20000000UL /* alloc space for ubs ... */ ++#define SLAB_NO_CHARGE 0x40000000UL /* ... 
but don't charge */ ++ + /* flags passed to a constructor func */ + #define SLAB_CTOR_CONSTRUCTOR 0x001UL /* if not set, then deconstructor */ + #define SLAB_CTOR_ATOMIC 0x002UL /* tell constructor it can't sleep */ +diff -uprN linux-2.6.15.orig/include/linux/socket.h linux-2.6.15-ve025stab014/include/linux/socket.h +--- linux-2.6.15.orig/include/linux/socket.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/socket.h 2006-01-27 14:48:08.000000000 +0300 +@@ -298,6 +298,7 @@ extern int memcpy_toiovec(struct iovec * + extern int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen); + extern int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr); + extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); ++extern int vz_security_proto_check(int family, int type, int protocol); + + #endif + #endif /* not kernel and not glibc */ +diff -uprN linux-2.6.15.orig/include/linux/swap.h linux-2.6.15-ve025stab014/include/linux/swap.h +--- linux-2.6.15.orig/include/linux/swap.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/swap.h 2006-01-27 14:48:06.000000000 +0300 +@@ -80,6 +80,7 @@ struct address_space; + struct sysinfo; + struct writeback_control; + struct zone; ++struct user_beancounter; + + /* + * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of +@@ -119,6 +120,7 @@ enum { + /* + * The in-memory structure used to track swap areas. + */ ++struct user_beancounter; + struct swap_info_struct { + unsigned int flags; + int prio; /* swap priority */ +@@ -136,6 +138,9 @@ struct swap_info_struct { + unsigned int max; + unsigned int inuse_pages; + int next; /* next entry on swap list */ ++#ifdef CONFIG_USER_SWAP_ACCOUNTING ++ struct user_beancounter **swap_ubs; ++#endif + }; + + struct swap_list_t { +@@ -209,7 +214,7 @@ extern long total_swap_pages; + extern unsigned int nr_swapfiles; + extern struct swap_info_struct swap_info[]; + extern void si_swapinfo(struct sysinfo *); +-extern swp_entry_t get_swap_page(void); ++extern swp_entry_t get_swap_page(struct user_beancounter *); + extern int swap_duplicate(swp_entry_t); + extern int valid_swaphandles(swp_entry_t, unsigned long *); + extern void swap_free(swp_entry_t); +@@ -277,7 +282,7 @@ static inline int remove_exclusive_swap_ + return 0; + } + +-static inline swp_entry_t get_swap_page(void) ++static inline swp_entry_t get_swap_page(struct user_beancounter *ub) + { + swp_entry_t entry; + entry.val = 0; +diff -uprN linux-2.6.15.orig/include/linux/sysctl.h linux-2.6.15-ve025stab014/include/linux/sysctl.h +--- linux-2.6.15.orig/include/linux/sysctl.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/sysctl.h 2006-01-27 14:48:08.000000000 +0300 +@@ -146,6 +146,7 @@ enum + KERN_RANDOMIZE=68, /* int: randomize virtual address space */ + KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ + KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ ++ KERN_VIRT_PIDS=202, /* int: VE pids virtualization */ + }; + + +@@ -394,6 +395,7 @@ enum + + enum { + NET_IPV4_ROUTE_FLUSH=1, ++ NET_IPV4_ROUTE_SRC_CHECK=188, + NET_IPV4_ROUTE_MIN_DELAY=2, + NET_IPV4_ROUTE_MAX_DELAY=3, + NET_IPV4_ROUTE_GC_THRESH=4, +@@ -893,6 +895,8 @@ extern int proc_doulongvec_minmax(ctl_ta + void __user *, size_t *, loff_t *); + extern int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int, + struct file *, void __user *, size_t *, loff_t *); ++extern int proc_doutsstring(ctl_table *table, int 
write, struct file *, ++ void __user *, size_t *, loff_t *); + + extern int do_sysctl (int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, +@@ -947,6 +951,8 @@ extern ctl_handler sysctl_ms_jiffies; + */ + + /* A sysctl table is an array of struct ctl_table: */ ++struct ve_struct; ++ + struct ctl_table + { + int ctl_name; /* Binary ID */ +@@ -960,6 +966,7 @@ struct ctl_table + struct proc_dir_entry *de; /* /proc control block */ + void *extra1; + void *extra2; ++ struct ve_struct *owner_env; + }; + + /* struct ctl_table_header is used to maintain dynamic lists of +@@ -976,6 +983,9 @@ struct ctl_table_header * register_sysct + int insert_at_head); + void unregister_sysctl_table(struct ctl_table_header * table); + ++ctl_table *clone_sysctl_template(ctl_table *tmpl, int nr); ++void free_sysctl_clone(ctl_table *clone); ++ + #else /* __KERNEL__ */ + + #endif /* __KERNEL__ */ +diff -uprN linux-2.6.15.orig/include/linux/tty.h linux-2.6.15-ve025stab014/include/linux/tty.h +--- linux-2.6.15.orig/include/linux/tty.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/tty.h 2006-01-27 14:48:08.000000000 +0300 +@@ -297,8 +297,11 @@ struct tty_struct { + spinlock_t read_lock; + /* If the tty has a pending do_SAK, queue it here - akpm */ + struct work_struct SAK_work; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(TTY, struct tty_struct, owner_env) ++ + /* tty magic number */ + #define TTY_MAGIC 0x5401 + +@@ -325,6 +328,7 @@ struct tty_struct { + #define TTY_PTY_LOCK 16 /* pty private */ + #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ + #define TTY_HUPPED 18 /* Post driver->hangup() */ ++#define TTY_CHARGED 19 /* Charged as ub resource */ + + #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) + +diff -uprN linux-2.6.15.orig/include/linux/tty_driver.h linux-2.6.15-ve025stab014/include/linux/tty_driver.h +--- linux-2.6.15.orig/include/linux/tty_driver.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/tty_driver.h 2006-01-27 14:48:08.000000000 +0300 +@@ -115,6 +115,7 @@ + * character to the device. + */ + ++#include <linux/ve_owner.h> + #include <linux/fs.h> + #include <linux/list.h> + #include <linux/cdev.h> +@@ -214,9 +215,18 @@ struct tty_driver { + unsigned int set, unsigned int clear); + + struct list_head tty_drivers; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(TTYDRV, struct tty_driver, owner_env) ++ ++#ifdef CONFIG_LEGACY_PTYS ++extern struct tty_driver *pty_driver; ++extern struct tty_driver *pty_slave_driver; ++#endif ++ + extern struct list_head tty_drivers; ++extern rwlock_t tty_driver_guard; + + struct tty_driver *alloc_tty_driver(int lines); + void put_tty_driver(struct tty_driver *driver); +diff -uprN linux-2.6.15.orig/include/linux/ve.h linux-2.6.15-ve025stab014/include/linux/ve.h +--- linux-2.6.15.orig/include/linux/ve.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/ve.h 2006-01-27 14:48:09.000000000 +0300 +@@ -0,0 +1,321 @@ ++/* ++ * include/linux/ve.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _LINUX_VE_H ++#define _LINUX_VE_H ++ ++#include <linux/config.h> ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++#include <linux/types.h> ++#include <linux/capability.h> ++#include <linux/utsname.h> ++#include <linux/sysctl.h> ++#include <linux/vzstat.h> ++#include <linux/kobject.h> ++ ++#ifdef VZMON_DEBUG ++# define VZTRACE(fmt,args...) \ ++ printk(KERN_DEBUG fmt, ##args) ++#else ++# define VZTRACE(fmt,args...) ++#endif /* VZMON_DEBUG */ ++ ++struct tty_driver; ++struct devpts_config; ++struct task_struct; ++struct new_utsname; ++struct file_system_type; ++struct icmp_mib; ++struct ip_mib; ++struct tcp_mib; ++struct udp_mib; ++struct linux_mib; ++struct fib_info; ++struct fib_rule; ++struct veip_struct; ++struct ve_monitor; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++struct fib_table; ++struct devcnfv4_struct; ++#ifdef CONFIG_VE_IPTABLES ++struct ipt_filter_initial_table; ++struct ipt_nat_initial_table; ++struct ipt_table; ++struct ip_conntrack; ++struct nf_hook_ops; ++typedef unsigned int (*ip_nat_helper_func)(void); ++struct ve_ip_conntrack { ++ struct list_head *_ip_conntrack_hash; ++ struct list_head _ip_conntrack_expect_list; ++ struct ip_conntrack_protocol ** _ip_ct_protos; ++ struct list_head _ip_conntrack_helpers; ++ int _ip_conntrack_max; ++ int _ip_conntrack_vmalloc; ++ atomic_t _ip_conntrack_count; ++ void (*_ip_conntrack_destroyed)(struct ip_conntrack *conntrack); ++#ifdef CONFIG_SYSCTL ++ unsigned long _ip_ct_tcp_timeouts[10]; ++ unsigned long _ip_ct_udp_timeout; ++ unsigned long _ip_ct_udp_timeout_stream; ++ unsigned long _ip_ct_icmp_timeout; ++ unsigned long _ip_ct_generic_timeout; ++ unsigned int _ip_ct_log_invalid; ++ unsigned long _ip_ct_tcp_timeout_max_retrans; ++ int _ip_ct_tcp_loose; ++ int _ip_ct_tcp_be_liberal; ++ int _ip_ct_tcp_max_retrans; ++ struct ctl_table_header *_ip_ct_sysctl_header; ++ ctl_table *_ip_ct_net_table; ++ ctl_table *_ip_ct_ipv4_table; ++ ctl_table *_ip_ct_netfilter_table; ++ ctl_table *_ip_ct_sysctl_table; ++#endif /*CONFIG_SYSCTL*/ ++ ++ struct ip_nat_protocol **_ip_nat_protos; ++ ip_nat_helper_func _ip_nat_ftp_hook; ++ ip_nat_helper_func _ip_nat_irc_hook; ++ struct list_head *_ip_nat_bysource; ++ struct ipt_table *_ip_nat_table; ++ ++ /* resource accounting */ ++ struct user_beancounter *ub; ++}; ++#endif ++#endif ++ ++#define UIDHASH_BITS_VE 6 ++#define UIDHASH_SZ_VE (1 << UIDHASH_BITS_VE) ++ ++struct ve_cpu_stats { ++ cycles_t idle_time; ++ cycles_t iowait_time; ++ cycles_t strt_idle_time; ++ cycles_t used_time; ++ seqcount_t stat_lock; ++ int nr_running; ++ int nr_unint; ++ int nr_iowait; ++ cputime64_t user; ++ cputime64_t nice; ++ cputime64_t system; ++} ____cacheline_aligned; ++ ++struct ve_struct { ++ struct ve_struct *prev; ++ struct ve_struct *next; ++ ++ envid_t veid; ++ struct task_struct *init_entry; ++ struct list_head vetask_lh; ++ kernel_cap_t cap_default; ++ atomic_t pcounter; ++ /* ref counter to ve from ipc */ ++ atomic_t counter; ++ unsigned int class_id; ++ struct veip_struct *veip; ++ struct rw_semaphore op_sem; ++ int is_running; ++ int virt_pids; ++ ++/* VE's root */ ++ struct vfsmount *fs_rootmnt; ++ struct dentry *fs_root; ++ ++/* sysctl */ ++ struct new_utsname *utsname; ++ struct list_head sysctl_lh; ++ struct ctl_table_header *kern_header; ++ struct ctl_table *kern_table; ++ struct ctl_table_header *quota_header; ++ struct ctl_table *quota_table; ++ struct file_system_type *proc_fstype; ++ struct vfsmount 
*proc_mnt; ++ struct proc_dir_entry *proc_root; ++ struct proc_dir_entry *proc_sys_root; ++ struct proc_dir_entry *_proc_net; ++ struct proc_dir_entry *_proc_net_stat; ++ ++/* SYSV IPC */ ++ struct ipc_ids *_shm_ids; ++ struct ipc_ids *_msg_ids; ++ struct ipc_ids *_sem_ids; ++ int _used_sems; ++ int _shm_tot; ++ size_t _shm_ctlmax; ++ size_t _shm_ctlall; ++ int _shm_ctlmni; ++ int _msg_ctlmax; ++ int _msg_ctlmni; ++ int _msg_ctlmnb; ++ int _sem_ctls[4]; ++ ++/* BSD pty's */ ++ struct tty_driver *pty_driver; ++ struct tty_driver *pty_slave_driver; ++ ++#ifdef CONFIG_UNIX98_PTYS ++ struct tty_driver *ptm_driver; ++ struct tty_driver *pts_driver; ++ struct idr *allocated_ptys; ++ struct file_system_type *devpts_fstype; ++ struct vfsmount *devpts_mnt; ++ struct dentry *devpts_root; ++ struct devpts_config *devpts_config; ++#endif ++ ++ struct file_system_type *shmem_fstype; ++ struct vfsmount *shmem_mnt; ++#ifdef CONFIG_VE_SYSFS ++ struct file_system_type *sysfs_fstype; ++ struct vfsmount *sysfs_mnt; ++ struct super_block *sysfs_sb; ++#endif ++ struct subsystem *class_subsys; ++ struct subsystem *class_obj_subsys; ++ struct class *net_class; ++ ++/* User uids hash */ ++ struct list_head uidhash_table[UIDHASH_SZ_VE]; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ struct hlist_head _net_dev_head; ++ struct hlist_head _net_dev_index_head; ++ struct net_device *_net_dev_base, **_net_dev_tail; ++ int ifindex; ++ struct net_device *_loopback_dev; ++ struct net_device *_venet_dev; ++ struct ipv4_devconf *_ipv4_devconf; ++ struct ipv4_devconf *_ipv4_devconf_dflt; ++ struct ctl_table_header *forward_header; ++ struct ctl_table *forward_table; ++#endif ++ unsigned long rt_flush_required; ++ ++/* per VE CPU stats*/ ++ struct timespec start_timespec; ++ u64 start_jiffies; ++ cycles_t start_cycles; ++ unsigned long avenrun[3]; /* loadavg data */ ++ ++ cycles_t cpu_used_ve; ++ struct kstat_lat_pcpu_struct sched_lat_ve; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ struct hlist_head *_fib_info_hash; ++ struct hlist_head *_fib_info_laddrhash; ++ int _fib_hash_size; ++ int _fib_info_cnt; ++ ++ struct fib_rule *_local_rule; ++ struct fib_rule *_fib_rules; ++#ifdef CONFIG_IP_MULTIPLE_TABLES ++ /* XXX: why a magic constant? 
*/ ++ struct fib_table *_fib_tables[256]; /* RT_TABLE_MAX - for now */ ++#else ++ struct fib_table *_main_table; ++ struct fib_table *_local_table; ++#endif ++ struct icmp_mib *_icmp_statistics[2]; ++ struct ipstats_mib *_ip_statistics[2]; ++ struct tcp_mib *_tcp_statistics[2]; ++ struct udp_mib *_udp_statistics[2]; ++ struct linux_mib *_net_statistics[2]; ++ struct venet_stat *stat; ++#ifdef CONFIG_VE_IPTABLES ++/* core/netfilter.c virtualization */ ++ void *_nf_hooks; ++ struct ipt_table *_ve_ipt_filter_pf; /* packet_filter struct */ ++ struct ipt_table *_ipt_mangle_table; ++ struct list_head *_ipt_target; ++ struct list_head *_ipt_match; ++ struct list_head *_ipt_tables; ++ ++ struct ipt_target *_ipt_standard_target; ++ struct ipt_target *_ipt_error_target; ++ struct ipt_match *_tcp_matchstruct; ++ struct ipt_match *_udp_matchstruct; ++ struct ipt_match *_icmp_matchstruct; ++ ++ __u64 _iptables_modules; ++ struct ve_ip_conntrack *_ip_conntrack; ++#endif /* CONFIG_VE_IPTABLES */ ++#endif ++ wait_queue_head_t *_log_wait; ++ unsigned long *_log_start; ++ unsigned long *_log_end; ++ unsigned long *_logged_chars; ++ char *log_buf; ++#define VE_DEFAULT_LOG_BUF_LEN 4096 ++ ++ struct ve_cpu_stats ve_cpu_stats[NR_CPUS] ____cacheline_aligned; ++ unsigned long down_at; ++ struct list_head cleanup_list; ++ ++ unsigned char sparse_vpid; ++ struct ve_monitor *monitor; ++ struct proc_dir_entry *monitor_proc; ++}; ++ ++#define VE_CPU_STATS(ve, cpu) (&((ve)->ve_cpu_stats[(cpu)])) ++ ++extern int nr_ve; ++ ++#ifdef CONFIG_VE ++ ++#ifdef CONFIG_VE_CALLS ++#define get_device_perms_ve real_get_device_perms_ve ++#define do_env_cleanup real_do_env_cleanup ++#define do_env_free real_do_env_free ++#define do_update_load_avg_ve real_update_load_avg_ve ++#endif ++ ++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode); ++void do_env_cleanup(struct ve_struct *envid); ++void do_update_load_avg_ve(void); ++void do_env_free(struct ve_struct *ptr); ++ ++#define ve_utsname (*get_exec_env()->utsname) ++ ++static inline struct ve_struct *get_ve(struct ve_struct *ptr) ++{ ++ if (ptr != NULL) ++ atomic_inc(&ptr->counter); ++ return ptr; ++} ++ ++static inline void put_ve(struct ve_struct *ptr) ++{ ++ if (ptr && atomic_dec_and_test(&ptr->counter)) { ++ if (atomic_read(&ptr->pcounter) > 0) ++ BUG(); ++ if (ptr->is_running) ++ BUG(); ++ do_env_free(ptr); ++ } ++} ++ ++#ifdef CONFIG_FAIRSCHED ++#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask) ++#else ++#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0) ++#endif ++#else /* CONFIG_VE */ ++#define ve_utsname system_utsname ++#define get_ve(ve) (NULL) ++#define put_ve(ve) do { } while (0) ++#endif /* CONFIG_VE */ ++ ++#endif /* _LINUX_VE_H */ +diff -uprN linux-2.6.15.orig/include/linux/ve_owner.h linux-2.6.15-ve025stab014/include/linux/ve_owner.h +--- linux-2.6.15.orig/include/linux/ve_owner.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/ve_owner.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,32 @@ ++/* ++ * include/linux/ve_owner.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __VE_OWNER_H__ ++#define __VE_OWNER_H__ ++ ++#include <linux/config.h> ++#include <linux/vmalloc.h> ++ ++ ++#define DCL_VE_OWNER(name, type, member) ++ /* prototype declares static inline functions */ ++ ++#define DCL_VE_OWNER_PROTO(name, type, member) \ ++type; \ ++static inline struct ve_struct *VE_OWNER_##name(const type *obj) \ ++{ \ ++ return obj->member; \ ++} \ ++static inline void SET_VE_OWNER_##name(type *obj, struct ve_struct *ve) \ ++{ \ ++ obj->member = ve; \ ++} ++ ++#endif /* __VE_OWNER_H__ */ +diff -uprN linux-2.6.15.orig/include/linux/ve_proto.h linux-2.6.15-ve025stab014/include/linux/ve_proto.h +--- linux-2.6.15.orig/include/linux/ve_proto.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/ve_proto.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,96 @@ ++/* ++ * include/linux/ve_proto.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __VE_H__ ++#define __VE_H__ ++ ++#ifdef CONFIG_VE ++ ++extern struct semaphore ve_call_guard; ++extern rwlock_t ve_call_lock; ++ ++#ifdef CONFIG_SYSVIPC ++extern void prepare_ipc(void); ++extern int init_ve_ipc(struct ve_struct *); ++extern void fini_ve_ipc(struct ve_struct *); ++extern void ve_ipc_cleanup(void); ++#endif ++ ++#ifdef CONFIG_UNIX98_PTYS ++extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ ++extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */ ++#endif ++ ++extern rwlock_t tty_driver_guard; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++void ip_fragment_cleanup(struct ve_struct *envid); ++void tcp_v4_kill_ve_sockets(struct ve_struct *envid); ++struct fib_table * fib_hash_init(int id); ++int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr); ++extern int main_loopback_init(struct net_device*); ++int venet_init(void); ++#endif ++ ++extern struct ve_struct *ve_list_head; ++extern rwlock_t ve_list_guard; ++extern struct ve_struct *get_ve_by_id(envid_t); ++extern struct ve_struct *__find_ve_by_id(envid_t); ++ ++extern int do_setdevperms(envid_t veid, unsigned type, ++ dev_t dev, unsigned mask); ++ ++#ifdef CONFIG_VE_IPTABLES ++extern int init_netfilter(void); ++extern int init_iptables(void); ++extern int init_iptable_filter(void); ++extern int init_iptable_limit(void); ++extern int init_iptable_multiport(void); ++extern int init_iptable_tos(void); ++extern int init_iptable_REJECT(void); ++extern void fini_netfilter(void); ++extern int fini_iptables(void); ++extern int fini_iptable_filter(void); ++extern int fini_iptable_limit(void); ++extern int fini_iptable_multiport(void); ++extern int fini_iptable_tos(void); ++extern int fini_iptable_REJECT(void); ++#endif ++ ++#define VE_HOOK_INIT 0 ++#define VE_HOOK_FINI 1 ++#define VE_MAX_HOOKS 2 ++ ++typedef int ve_hookfn(unsigned int hooknum, void *data); ++ ++struct ve_hook ++{ ++ struct list_head list; ++ ve_hookfn *hook; ++ ve_hookfn *undo; ++ struct module *owner; ++ int hooknum; ++ /* Functions are called in ascending priority. 
*/ ++ int priority; ++}; ++ ++extern int ve_hook_register(struct ve_hook *vh); ++extern void ve_hook_unregister(struct ve_hook *vh); ++ ++struct ve_hook_init_data ++{ ++ struct ve_struct *env; ++ u32 class_id; ++ struct env_create_param *data; ++ int datalen; ++}; ++ ++#endif ++#endif +diff -uprN linux-2.6.15.orig/include/linux/ve_task.h linux-2.6.15-ve025stab014/include/linux/ve_task.h +--- linux-2.6.15.orig/include/linux/ve_task.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/ve_task.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,34 @@ ++/* ++ * include/linux/ve_task.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __VE_TASK_H__ ++#define __VE_TASK_H__ ++ ++#include <linux/seqlock.h> ++ ++struct ve_task_info { ++/* virtualization */ ++ struct ve_struct *owner_env; ++ struct ve_struct *exec_env; ++ struct list_head vetask_list; ++ struct dentry *glob_proc_dentry; ++/* statistics: scheduling latency */ ++ cycles_t sleep_time; ++ cycles_t sched_time; ++ cycles_t sleep_stamp; ++ cycles_t wakeup_stamp; ++ seqcount_t wakeup_lock; ++}; ++ ++#define VE_TASK_INFO(task) (&(task)->ve_task_info) ++#define VE_TASK_LIST_2_TASK(lh) \ ++ list_entry(lh, struct task_struct, ve_task_info.vetask_list) ++ ++#endif /* __VE_TASK_H__ */ +diff -uprN linux-2.6.15.orig/include/linux/venet.h linux-2.6.15-ve025stab014/include/linux/venet.h +--- linux-2.6.15.orig/include/linux/venet.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/venet.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,68 @@ ++/* ++ * include/linux/venet.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _VENET_H ++#define _VENET_H ++ ++#include <linux/list.h> ++#include <linux/spinlock.h> ++#include <linux/vzcalluser.h> ++ ++#define VEIP_HASH_SZ 512 ++ ++struct ve_struct; ++struct venet_stat; ++struct ip_entry_struct ++{ ++ __u32 ip; ++ struct ve_struct *active_env; ++ struct venet_stat *stat; ++ struct veip_struct *veip; ++ struct list_head ip_hash; ++ struct list_head ve_list; ++}; ++ ++struct veip_struct ++{ ++ struct list_head src_lh; ++ struct list_head dst_lh; ++ struct list_head ip_lh; ++ struct list_head list; ++ envid_t veid; ++}; ++ ++/* veip_hash_lock should be taken for write by caller */ ++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip); ++/* veip_hash_lock should be taken for write by caller */ ++void ip_entry_unhash(struct ip_entry_struct *entry); ++/* veip_hash_lock should be taken for read by caller */ ++struct ip_entry_struct *ip_entry_lookup(u32 addr); ++ ++/* veip_hash_lock should be taken for read by caller */ ++struct veip_struct *veip_find(envid_t veid); ++/* veip_hash_lock should be taken for write by caller */ ++struct veip_struct *veip_findcreate(envid_t veid); ++/* veip_hash_lock should be taken for write by caller */ ++void veip_put(struct veip_struct *veip); ++ ++int veip_start(struct ve_struct *ve); ++void veip_stop(struct ve_struct *ve); ++int veip_entry_add(struct ve_struct *ve, struct sockaddr_in *addr); ++int veip_entry_del(envid_t veid, struct sockaddr_in *addr); ++int venet_change_skb_owner(struct sk_buff *skb); ++ ++extern struct list_head ip_entry_hash_table[]; ++extern rwlock_t veip_hash_lock; ++ ++#ifdef CONFIG_PROC_FS ++int veip_seq_show(struct seq_file *m, void *v); ++#endif ++ ++#endif +diff -uprN linux-2.6.15.orig/include/linux/virtinfo.h linux-2.6.15-ve025stab014/include/linux/virtinfo.h +--- linux-2.6.15.orig/include/linux/virtinfo.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/virtinfo.h 2006-01-27 14:48:07.000000000 +0300 +@@ -0,0 +1,52 @@ ++/* ++ * include/linux/virtinfo.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __LINUX_VIRTINFO_H ++#define __LINUX_VIRTINFO_H ++ ++#include <linux/kernel.h> ++#include <linux/page-flags.h> ++#include <linux/rwsem.h> ++#include <linux/notifier.h> ++ ++struct vnotifier_block ++{ ++ int (*notifier_call)(struct vnotifier_block *self, ++ unsigned long, void *, int); ++ struct vnotifier_block *next; ++ int priority; ++}; ++ ++void virtinfo_notifier_register(int type, struct vnotifier_block *nb); ++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb); ++int virtinfo_notifier_call(int type, unsigned long n, void *data); ++ ++struct meminfo { ++ struct sysinfo si; ++ unsigned long active, inactive; ++ unsigned long cache, swapcache; ++ unsigned long committed_space; ++ unsigned long allowed; ++ struct page_state ps; ++ unsigned long vmalloc_total, vmalloc_used, vmalloc_largest; ++}; ++ ++#define VIRTINFO_MEMINFO 0 ++#define VIRTINFO_ENOUGHMEM 1 ++ ++enum virt_info_types { ++ VITYPE_GENERAL, ++ VITYPE_FAUDIT, ++ VITYPE_QUOTA, ++ ++ VIRT_TYPES ++}; ++ ++#endif /* __LINUX_VIRTINFO_H */ +diff -uprN linux-2.6.15.orig/include/linux/vmalloc.h linux-2.6.15-ve025stab014/include/linux/vmalloc.h +--- linux-2.6.15.orig/include/linux/vmalloc.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vmalloc.h 2006-01-27 14:48:08.000000000 +0300 +@@ -18,6 +18,10 @@ + #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ + #endif + ++/* align size to 2^n page boundary */ ++#define POWER2_PAGE_ALIGN(size) \ ++ ((typeof(size))(1UL << (PAGE_SHIFT + get_order(size)))) ++ + struct vm_struct { + void *addr; + unsigned long size; +@@ -36,6 +40,8 @@ extern void *vmalloc_node(unsigned long + extern void *vmalloc_exec(unsigned long size); + extern void *vmalloc_32(unsigned long size); + extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); ++extern void *vmalloc_best(unsigned long size); ++extern void *ub_vmalloc_best(unsigned long size); + extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot); + extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, +@@ -52,6 +58,9 @@ extern void vunmap(void *addr); + extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); + extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end); ++extern struct vm_struct * get_vm_area_best(unsigned long size, ++ unsigned long flags); ++extern void vprintstat(void); + extern struct vm_struct *get_vm_area_node(unsigned long size, + unsigned long flags, int node); + extern struct vm_struct *remove_vm_area(void *addr); +diff -uprN linux-2.6.15.orig/include/linux/vzcalluser.h linux-2.6.15-ve025stab014/include/linux/vzcalluser.h +--- linux-2.6.15.orig/include/linux/vzcalluser.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vzcalluser.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,206 @@ ++/* ++ * include/linux/vzcalluser.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _LINUX_VZCALLUSER_H ++#define _LINUX_VZCALLUSER_H ++ ++#include <linux/types.h> ++#include <linux/ioctl.h> ++ ++#define KERN_VZ_PRIV_RANGE 51 ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++/* ++ * VE management ioctls ++ */ ++ ++struct vzctl_old_env_create { ++ envid_t veid; ++ unsigned flags; ++#define VE_CREATE 1 /* Create VE, VE_ENTER added automatically */ ++#define VE_EXCLUSIVE 2 /* Fail if exists */ ++#define VE_ENTER 4 /* Enter existing VE */ ++#define VE_TEST 8 /* Test if VE exists */ ++ __u32 addr; ++}; ++ ++struct vzctl_mark_env_to_down { ++ envid_t veid; ++}; ++ ++struct vzctl_setdevperms { ++ envid_t veid; ++ unsigned type; ++#define VE_USE_MAJOR 010 /* Test MAJOR supplied in rule */ ++#define VE_USE_MINOR 030 /* Test MINOR supplied in rule */ ++#define VE_USE_MASK 030 /* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */ ++ unsigned dev; ++ unsigned mask; ++}; ++ ++struct vzctl_ve_netdev { ++ envid_t veid; ++ int op; ++#define VE_NETDEV_ADD 1 ++#define VE_NETDEV_DEL 2 ++ char *dev_name; ++}; ++ ++/* these masks represent modules */ ++#define VE_IP_IPTABLES_MOD (1U<<0) ++#define VE_IP_FILTER_MOD (1U<<1) ++#define VE_IP_MANGLE_MOD (1U<<2) ++#define VE_IP_MATCH_LIMIT_MOD (1U<<3) ++#define VE_IP_MATCH_MULTIPORT_MOD (1U<<4) ++#define VE_IP_MATCH_TOS_MOD (1U<<5) ++#define VE_IP_TARGET_TOS_MOD (1U<<6) ++#define VE_IP_TARGET_REJECT_MOD (1U<<7) ++#define VE_IP_TARGET_TCPMSS_MOD (1U<<8) ++#define VE_IP_MATCH_TCPMSS_MOD (1U<<9) ++#define VE_IP_MATCH_TTL_MOD (1U<<10) ++#define VE_IP_TARGET_LOG_MOD (1U<<11) ++#define VE_IP_MATCH_LENGTH_MOD (1U<<12) ++#define VE_IP_CONNTRACK_MOD (1U<<14) ++#define VE_IP_CONNTRACK_FTP_MOD (1U<<15) ++#define VE_IP_CONNTRACK_IRC_MOD (1U<<16) ++#define VE_IP_MATCH_CONNTRACK_MOD (1U<<17) ++#define VE_IP_MATCH_STATE_MOD (1U<<18) ++#define VE_IP_MATCH_HELPER_MOD (1U<<19) ++#define VE_IP_NAT_MOD (1U<<20) ++#define VE_IP_NAT_FTP_MOD (1U<<21) ++#define VE_IP_NAT_IRC_MOD (1U<<22) ++ ++/* these masks represent modules with their dependences */ ++#define VE_IP_IPTABLES (VE_IP_IPTABLES_MOD) ++#define VE_IP_FILTER (VE_IP_FILTER_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MANGLE (VE_IP_MANGLE_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_LIMIT (VE_IP_MATCH_LIMIT_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_MULTIPORT (VE_IP_MATCH_MULTIPORT_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_TOS (VE_IP_MATCH_TOS_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_TARGET_TOS (VE_IP_TARGET_TOS_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_TARGET_REJECT (VE_IP_TARGET_REJECT_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_TARGET_TCPMSS (VE_IP_TARGET_TCPMSS_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_TCPMSS (VE_IP_MATCH_TCPMSS_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_TTL (VE_IP_MATCH_TTL_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_TARGET_LOG (VE_IP_TARGET_LOG_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_LENGTH (VE_IP_MATCH_LENGTH_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_CONNTRACK (VE_IP_CONNTRACK_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_CONNTRACK_FTP (VE_IP_CONNTRACK_FTP_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_CONNTRACK_IRC (VE_IP_CONNTRACK_IRC_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_MATCH_CONNTRACK (VE_IP_MATCH_CONNTRACK_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_MATCH_STATE (VE_IP_MATCH_STATE_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_MATCH_HELPER (VE_IP_MATCH_HELPER_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_NAT (VE_IP_NAT_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_NAT_FTP 
(VE_IP_NAT_FTP_MOD \ ++ | VE_IP_NAT | VE_IP_CONNTRACK_FTP) ++#define VE_IP_NAT_IRC (VE_IP_NAT_IRC_MOD \ ++ | VE_IP_NAT | VE_IP_CONNTRACK_IRC) ++ ++/* safe iptables mask to be used by default */ ++#define VE_IP_DEFAULT \ ++ (VE_IP_IPTABLES | \ ++ VE_IP_FILTER | VE_IP_MANGLE | \ ++ VE_IP_MATCH_LIMIT | VE_IP_MATCH_MULTIPORT | \ ++ VE_IP_MATCH_TOS | VE_IP_TARGET_REJECT | \ ++ VE_IP_TARGET_TCPMSS | VE_IP_MATCH_TCPMSS | \ ++ VE_IP_MATCH_TTL | VE_IP_MATCH_LENGTH) ++ ++#define VE_IPT_CMP(x,y) (((x) & (y)) == (y)) ++ ++struct vzctl_env_create_cid { ++ envid_t veid; ++ unsigned flags; ++ __u32 class_id; ++}; ++ ++struct vzctl_env_create { ++ envid_t veid; ++ unsigned flags; ++ __u32 class_id; ++}; ++ ++struct env_create_param { ++ __u64 iptables_mask; ++#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(__u64) ++#define VZCTL_ENV_CREATE_DATA_MAXLEN sizeof(__u64) ++}; ++ ++struct vzctl_env_create_data { ++ envid_t veid; ++ unsigned flags; ++ __u32 class_id; ++ struct env_create_param *data; ++ int datalen; ++}; ++ ++struct vz_load_avg { ++ int val_int; ++ int val_frac; ++}; ++ ++struct vz_cpu_stat { ++ unsigned long user_jif; ++ unsigned long nice_jif; ++ unsigned long system_jif; ++ unsigned long uptime_jif; ++ cycles_t idle_clk; ++ cycles_t strv_clk; ++ cycles_t uptime_clk; ++ struct vz_load_avg avenrun[3]; /* loadavg data */ ++}; ++ ++struct vzctl_cpustatctl { ++ envid_t veid; ++ struct vz_cpu_stat *cpustat; ++}; ++ ++#define VZCTLTYPE '.' ++#define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, \ ++ struct vzctl_old_env_create) ++#define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, \ ++ struct vzctl_mark_env_to_down) ++#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, \ ++ struct vzctl_setdevperms) ++#define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, \ ++ struct vzctl_env_create_cid) ++#define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, \ ++ struct vzctl_env_create) ++#define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, \ ++ struct vzctl_cpustatctl) ++#define VZCTL_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ ++ struct vzctl_env_create_data) ++#define VZCTL_VE_NETDEV _IOW(VZCTLTYPE, 11, \ ++ struct vzctl_ve_netdev) ++ ++ ++#endif +diff -uprN linux-2.6.15.orig/include/linux/vzctl.h linux-2.6.15-ve025stab014/include/linux/vzctl.h +--- linux-2.6.15.orig/include/linux/vzctl.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vzctl.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,30 @@ ++/* ++ * include/linux/vzctl.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _LINUX_VZCTL_H ++#define _LINUX_VZCTL_H ++ ++#include <linux/list.h> ++ ++struct module; ++struct inode; ++struct file; ++struct vzioctlinfo { ++ unsigned type; ++ int (*func)(struct inode *, struct file *, ++ unsigned int, unsigned long); ++ struct module *owner; ++ struct list_head list; ++}; ++ ++extern void vzioctl_register(struct vzioctlinfo *inf); ++extern void vzioctl_unregister(struct vzioctlinfo *inf); ++ ++#endif +diff -uprN linux-2.6.15.orig/include/linux/vzctl_quota.h linux-2.6.15-ve025stab014/include/linux/vzctl_quota.h +--- linux-2.6.15.orig/include/linux/vzctl_quota.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vzctl_quota.h 2006-01-27 14:48:09.000000000 +0300 +@@ -0,0 +1,43 @@ ++/* ++ * include/linux/vzctl_quota.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __LINUX_VZCTL_QUOTA_H__ ++#define __LINUX_VZCTL_QUOTA_H__ ++ ++/* ++ * Quota management ioctl ++ */ ++ ++struct vz_quota_stat; ++struct vzctl_quotactl { ++ int cmd; ++ unsigned int quota_id; ++ struct vz_quota_stat *qstat; ++ char *ve_root; ++}; ++ ++struct vzctl_quotaugidctl { ++ int cmd; /* subcommand */ ++ unsigned int quota_id; /* quota id where it applies to */ ++ unsigned int ugid_index;/* for reading statistic. index of first ++ uid/gid record to read */ ++ unsigned int ugid_size; /* size of ugid_buf array */ ++ void *addr; /* user-level buffer */ ++}; ++ ++#define VZDQCTLTYPE '+' ++#define VZCTL_QUOTA_CTL _IOWR(VZDQCTLTYPE, 1, \ ++ struct vzctl_quotactl) ++#define VZCTL_QUOTA_NEW_CTL _IOWR(VZDQCTLTYPE, 2, \ ++ struct vzctl_quotactl) ++#define VZCTL_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \ ++ struct vzctl_quotaugidctl) ++ ++#endif /* __LINUX_VZCTL_QUOTA_H__ */ +diff -uprN linux-2.6.15.orig/include/linux/vzctl_venet.h linux-2.6.15-ve025stab014/include/linux/vzctl_venet.h +--- linux-2.6.15.orig/include/linux/vzctl_venet.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vzctl_venet.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,36 @@ ++/* ++ * include/linux/vzctl_venet.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _VZCTL_VENET_H ++#define _VZCTL_VENET_H ++ ++#include <linux/types.h> ++#include <linux/ioctl.h> ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++struct vzctl_ve_ip_map { ++ envid_t veid; ++ int op; ++#define VE_IP_ADD 1 ++#define VE_IP_DEL 2 ++ struct sockaddr *addr; ++ int addrlen; ++}; ++ ++#define VENETCTLTYPE '(' ++ ++#define VENETCTL_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ ++ struct vzctl_ve_ip_map) ++ ++#endif +diff -uprN linux-2.6.15.orig/include/linux/vzdq_tree.h linux-2.6.15-ve025stab014/include/linux/vzdq_tree.h +--- linux-2.6.15.orig/include/linux/vzdq_tree.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vzdq_tree.h 2006-01-27 14:48:09.000000000 +0300 +@@ -0,0 +1,99 @@ ++/* ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains Virtuozzo disk quota tree definition ++ */ ++ ++#ifndef _VZDQ_TREE_H ++#define _VZDQ_TREE_H ++ ++#include <linux/list.h> ++#include <asm/string.h> ++ ++typedef unsigned int quotaid_t; ++#define QUOTAID_BITS 32 ++#define QUOTAID_BBITS 4 ++#define QUOTAID_EBITS 8 ++ ++#if QUOTAID_EBITS % QUOTAID_BBITS ++#error Quota bit assumption failure ++#endif ++ ++#define QUOTATREE_BSIZE (1 << QUOTAID_BBITS) ++#define QUOTATREE_BMASK (QUOTATREE_BSIZE - 1) ++#define QUOTATREE_DEPTH ((QUOTAID_BITS + QUOTAID_BBITS - 1) \ ++ / QUOTAID_BBITS) ++#define QUOTATREE_EDEPTH ((QUOTAID_BITS + QUOTAID_EBITS - 1) \ ++ / QUOTAID_EBITS) ++#define QUOTATREE_BSHIFT(lvl) ((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS) ++ ++/* ++ * Depth of keeping unused node (not inclusive). ++ * 0 means release all nodes including root, ++ * QUOTATREE_DEPTH means never release nodes. ++ * Current value: release all nodes strictly after QUOTATREE_EDEPTH ++ * (measured in external shift units). ++ */ ++#define QUOTATREE_CDEPTH (QUOTATREE_DEPTH \ ++ - 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \ ++ + 1) ++ ++/* ++ * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes. 
++ * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS), ++ * and each node contains 2^QUOTAID_BBITS pointers. ++ * Level 0 is a (single) tree root node. ++ * ++ * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data. ++ * Nodes of lower levels contain pointers to nodes. ++ * ++ * Double pointer in array of i-level node, pointing to a (i+1)-level node ++ * (such as inside quotatree_find_state) are marked by level (i+1), not i. ++ * Level 0 double pointer is a pointer to root inside tree struct. ++ * ++ * The tree is permanent, i.e. all index blocks allocated are keeped alive to ++ * preserve the blocks numbers in the quota file tree to keep its changes ++ * locally. ++ */ ++struct quotatree_node { ++ struct list_head list; ++ quotaid_t num; ++ void *blocks[QUOTATREE_BSIZE]; ++}; ++ ++struct quotatree_level { ++ struct list_head usedlh, freelh; ++ quotaid_t freenum; ++}; ++ ++struct quotatree_tree { ++ struct quotatree_level levels[QUOTATREE_DEPTH]; ++ struct quotatree_node *root; ++ unsigned int leaf_num; ++}; ++ ++struct quotatree_find_state { ++ void **block; ++ int level; ++}; ++ ++/* number of leafs (objects) and leaf level of the tree */ ++#define QTREE_LEAFNUM(tree) ((tree)->leaf_num) ++#define QTREE_LEAFLVL(tree) (&(tree)->levels[QUOTATREE_DEPTH - 1]) ++ ++struct quotatree_tree *quotatree_alloc(void); ++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st); ++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st, void *data); ++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id); ++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)); ++void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id); ++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index); ++ ++#endif /* _VZDQ_TREE_H */ ++ +diff -uprN linux-2.6.15.orig/include/linux/vzquota.h linux-2.6.15-ve025stab014/include/linux/vzquota.h +--- linux-2.6.15.orig/include/linux/vzquota.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vzquota.h 2006-01-27 14:48:09.000000000 +0300 +@@ -0,0 +1,288 @@ ++/* ++ * ++ * Copyright (C) 2001-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ * This file contains Virtuozzo disk quota implementation ++ */ ++ ++#ifndef _VZDQUOTA_H ++#define _VZDQUOTA_H ++ ++#include <linux/types.h> ++#include <linux/quota.h> ++ ++/* vzquotactl syscall commands */ ++#define VZ_DQ_CREATE 5 /* create quota master block */ ++#define VZ_DQ_DESTROY 6 /* destroy qmblk */ ++#define VZ_DQ_ON 7 /* mark dentry with already created qmblk */ ++#define VZ_DQ_OFF 8 /* remove mark, don't destroy qmblk */ ++#define VZ_DQ_SETLIMIT 9 /* set new limits */ ++#define VZ_DQ_GETSTAT 10 /* get usage statistic */ ++/* set of syscalls to maintain UGID quotas */ ++#define VZ_DQ_UGID_GETSTAT 1 /* get usage/limits for ugid(s) */ ++#define VZ_DQ_UGID_ADDSTAT 2 /* set usage/limits statistic for ugid(s) */ ++#define VZ_DQ_UGID_GETGRACE 3 /* get expire times */ ++#define VZ_DQ_UGID_SETGRACE 4 /* set expire times */ ++#define VZ_DQ_UGID_GETCONFIG 5 /* get ugid_max limit, cnt, flags of qmblk */ ++#define VZ_DQ_UGID_SETCONFIG 6 /* set ugid_max limit, flags of qmblk */ ++#define VZ_DQ_UGID_SETLIMIT 7 /* set ugid B/I limits */ ++#define VZ_DQ_UGID_SETINFO 8 /* set ugid info */ ++ ++/* common structure for vz and ugid quota */ ++struct dq_stat { ++ /* blocks limits */ ++ __u64 bhardlimit; /* absolute limit in bytes */ ++ __u64 bsoftlimit; /* preferred limit in bytes */ ++ time_t btime; /* time limit for excessive disk use */ ++ __u64 bcurrent; /* current bytes count */ ++ /* inodes limits */ ++ __u32 ihardlimit; /* absolute limit on allocated inodes */ ++ __u32 isoftlimit; /* preferred inode limit */ ++ time_t itime; /* time limit for excessive inode use */ ++ __u32 icurrent; /* current # allocated inodes */ ++}; ++ ++/* Values for dq_info->flags */ ++#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ ++#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ ++ ++struct dq_info { ++ time_t bexpire; /* expire timeout for excessive disk use */ ++ time_t iexpire; /* expire timeout for excessive inode use */ ++ unsigned flags; /* see previos defines */ ++}; ++ ++struct vz_quota_stat { ++ struct dq_stat dq_stat; ++ struct dq_info dq_info; ++}; ++ ++/* UID/GID interface record - for user-kernel level exchange */ ++struct vz_quota_iface { ++ unsigned int qi_id; /* UID/GID this applies to */ ++ unsigned int qi_type; /* USRQUOTA|GRPQUOTA */ ++ struct dq_stat qi_stat; /* limits, options, usage stats */ ++}; ++ ++/* values for flags and dq_flags */ ++/* this flag is set if the userspace has been unable to provide usage ++ * information about all ugids ++ * if the flag is set, we don't allocate new UG quota blocks (their ++ * current usage is unknown) or free existing UG quota blocks (not to ++ * lose information that this block is ok) */ ++#define VZDQUG_FIXED_SET 0x01 ++/* permit to use ugid quota */ ++#define VZDQUG_ON 0x02 ++#define VZDQ_USRQUOTA 0x10 ++#define VZDQ_GRPQUOTA 0x20 ++#define VZDQ_NOACT 0x1000 /* not actual */ ++#define VZDQ_NOQUOT 0x2000 /* not under quota tree */ ++ ++struct vz_quota_ugid_stat { ++ unsigned int limit; /* max amount of ugid records */ ++ unsigned int count; /* amount of ugid records */ ++ unsigned int flags; ++}; ++ ++struct vz_quota_ugid_setlimit { ++ unsigned int type; /* quota type (USR/GRP) */ ++ unsigned int id; /* ugid */ ++ struct if_dqblk dqb; /* limits info */ ++}; ++ ++struct vz_quota_ugid_setinfo { ++ unsigned int type; /* quota type (USR/GRP) */ ++ struct if_dqinfo dqi; /* grace info */ ++}; ++ ++#ifdef __KERNEL__ ++#include <linux/list.h> ++#include <asm/atomic.h> ++#include <asm/semaphore.h> ++#include <linux/time.h> 
++#include <linux/vzquota_qlnk.h> ++#include <linux/vzdq_tree.h> ++ ++/* Values for dq_info flags */ ++#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ ++#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ ++ ++/* values for dq_state */ ++#define VZDQ_STARTING 0 /* created, not turned on yet */ ++#define VZDQ_WORKING 1 /* quota created, turned on */ ++#define VZDQ_STOPING 2 /* created, turned on and off */ ++ ++/* master quota record - one per veid */ ++struct vz_quota_master { ++ struct list_head dq_hash; /* next quota in hash list */ ++ atomic_t dq_count; /* inode reference count */ ++ unsigned int dq_flags; /* see VZDQUG_FIXED_SET */ ++ unsigned int dq_state; /* see values above */ ++ unsigned int dq_id; /* VEID this applies to */ ++ struct dq_stat dq_stat; /* limits, grace, usage stats */ ++ struct dq_info dq_info; /* grace times and flags */ ++ spinlock_t dq_data_lock; /* for dq_stat */ ++ ++ struct semaphore dq_sem; /* semaphore to protect ++ ugid tree */ ++ ++ struct list_head dq_ilink_list; /* list of vz_quota_ilink */ ++ struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */ ++ struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree for GIDs */ ++ unsigned int dq_ugid_count; /* amount of ugid records */ ++ unsigned int dq_ugid_max; /* max amount of ugid records */ ++ struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */ ++ ++ struct dentry *dq_root_dentry;/* dentry of fs tree */ ++ struct vfsmount *dq_root_mnt; /* vfsmnt of this dentry */ ++ struct super_block *dq_sb; /* superblock of our quota root */ ++}; ++ ++/* UID/GID quota record - one per pair (quota_master, uid or gid) */ ++struct vz_quota_ugid { ++ unsigned int qugid_id; /* UID/GID this applies to */ ++ struct dq_stat qugid_stat; /* limits, options, usage stats */ ++ int qugid_type; /* USRQUOTA|GRPQUOTA */ ++ atomic_t qugid_count; /* reference count */ ++}; ++ ++#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11) ++ ++struct vz_quota_datast { ++ struct vz_quota_ilink qlnk; ++}; ++ ++#define VIRTINFO_QUOTA_GETSTAT 0 ++#define VIRTINFO_QUOTA_ON 1 ++#define VIRTINFO_QUOTA_OFF 2 ++ ++struct virt_info_quota { ++ struct super_block *super; ++ struct dq_stat *qstat; ++}; ++ ++/* ++ * Interface to VZ quota core ++ */ ++#define INODE_QLNK(inode) (&(inode)->i_qlnk) ++#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk) ++ ++#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef) ++ ++#define VZ_QUOTAO_SETE 1 ++#define VZ_QUOTAO_INIT 2 ++#define VZ_QUOTAO_DESTR 3 ++#define VZ_QUOTAO_SWAP 4 ++#define VZ_QUOTAO_INICAL 5 ++#define VZ_QUOTAO_DRCAL 6 ++#define VZ_QUOTAO_QSET 7 ++#define VZ_QUOTAO_TRANS 8 ++#define VZ_QUOTAO_ACT 9 ++#define VZ_QUOTAO_DTREE 10 ++#define VZ_QUOTAO_DET 11 ++#define VZ_QUOTAO_ON 12 ++ ++extern struct semaphore vz_quota_sem; ++void inode_qmblk_lock(struct super_block *sb); ++void inode_qmblk_unlock(struct super_block *sb); ++void qmblk_data_read_lock(struct vz_quota_master *qmblk); ++void qmblk_data_read_unlock(struct vz_quota_master *qmblk); ++void qmblk_data_write_lock(struct vz_quota_master *qmblk); ++void qmblk_data_write_unlock(struct vz_quota_master *qmblk); ++ ++/* for quota operations */ ++void vzquota_inode_init_call(struct inode *inode); ++void vzquota_inode_drop_call(struct inode *inode); ++int vzquota_inode_transfer_call(struct inode *, struct iattr *); ++struct vz_quota_master *vzquota_inode_data(struct inode *inode, ++ struct vz_quota_datast *); ++void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *); 
++int vzquota_rename_check(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir); ++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode); ++/* for second-level quota */ ++struct vz_quota_master *vzquota_find_qmblk(struct super_block *); ++/* for management operations */ ++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, ++ struct vz_quota_stat *qstat); ++void vzquota_free_master(struct vz_quota_master *); ++struct vz_quota_master *vzquota_find_master(unsigned int quota_id); ++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, ++ struct vz_quota_master *qmblk); ++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk); ++int vzquota_get_super(struct super_block *sb); ++void vzquota_put_super(struct super_block *sb); ++ ++static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk) ++{ ++ if (!atomic_read(&qmblk->dq_count)) ++ BUG(); ++ atomic_inc(&qmblk->dq_count); ++ return qmblk; ++} ++ ++static inline void __qmblk_put(struct vz_quota_master *qmblk) ++{ ++ atomic_dec(&qmblk->dq_count); ++} ++ ++static inline void qmblk_put(struct vz_quota_master *qmblk) ++{ ++ if (!atomic_dec_and_test(&qmblk->dq_count)) ++ return; ++ vzquota_free_master(qmblk); ++} ++ ++extern struct list_head vzquota_hash_table[]; ++extern int vzquota_hash_size; ++ ++/* ++ * Interface to VZ UGID quota ++ */ ++extern struct quotactl_ops vz_quotactl_operations; ++extern struct dquot_operations vz_quota_operations2; ++extern struct quota_format_type vz_quota_empty_v2_format; ++ ++#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? \ ++ qmblk->dq_uid_tree : \ ++ qmblk->dq_gid_tree) ++ ++#define VZDQUG_FIND_DONT_ALLOC 1 ++#define VZDQUG_FIND_FAKE 2 ++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags); ++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags); ++struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid); ++void vzquota_put_ugid(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid); ++void vzquota_kill_ugid(struct vz_quota_master *qmblk); ++int vzquota_ugid_init(void); ++void vzquota_ugid_release(void); ++int vzquota_transfer_usage(struct inode *inode, int mask, ++ struct vz_quota_ilink *qlnk); ++ ++struct vzctl_quotaugidctl; ++long do_vzquotaugidctl(struct vzctl_quotaugidctl *qub); ++ ++/* ++ * Other VZ quota parts ++ */ ++extern struct dquot_operations vz_quota_operations; ++ ++long do_vzquotactl(int cmd, unsigned int quota_id, ++ struct vz_quota_stat *qstat, const char *ve_root); ++int vzquota_proc_init(void); ++void vzquota_proc_release(void); ++struct vz_quota_master *vzquota_find_qmblk(struct super_block *); ++extern struct semaphore vz_quota_sem; ++ ++void vzaquota_init(void); ++void vzaquota_fini(void); ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _VZDQUOTA_H */ +diff -uprN linux-2.6.15.orig/include/linux/vzquota_qlnk.h linux-2.6.15-ve025stab014/include/linux/vzquota_qlnk.h +--- linux-2.6.15.orig/include/linux/vzquota_qlnk.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vzquota_qlnk.h 2006-01-27 14:48:09.000000000 +0300 +@@ -0,0 +1,25 @@ ++/* ++ * include/linux/vzquota_qlnk.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _VZDQUOTA_QLNK_H ++#define _VZDQUOTA_QLNK_H ++ ++struct vz_quota_master; ++struct vz_quota_ugid; ++ ++/* inode link, used to track inodes using quota via dq_ilink_list */ ++struct vz_quota_ilink { ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid *qugid[MAXQUOTAS]; ++ struct list_head list; ++ unsigned char origin; ++}; ++ ++#endif /* _VZDQUOTA_QLNK_H */ +diff -uprN linux-2.6.15.orig/include/linux/vzratelimit.h linux-2.6.15-ve025stab014/include/linux/vzratelimit.h +--- linux-2.6.15.orig/include/linux/vzratelimit.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vzratelimit.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,28 @@ ++/* ++ * include/linux/vzratelimit.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __VZ_RATELIMIT_H__ ++#define __VZ_RATELIMIT_H__ ++ ++/* ++ * Generic ratelimiting stuff. ++ */ ++ ++struct vz_rate_info { ++ int burst; ++ int interval; /* jiffy_t per event */ ++ int bucket; /* kind of leaky bucket */ ++ unsigned long last; /* last event */ ++}; ++ ++/* Return true if rate limit permits. */ ++int vz_ratelimit(struct vz_rate_info *p); ++ ++#endif /* __VZ_RATELIMIT_H__ */ +diff -uprN linux-2.6.15.orig/include/linux/vzstat.h linux-2.6.15-ve025stab014/include/linux/vzstat.h +--- linux-2.6.15.orig/include/linux/vzstat.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/linux/vzstat.h 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,182 @@ ++/* ++ * include/linux/vzstat.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __VZSTAT_H__ ++#define __VZSTAT_H__ ++ ++struct swap_cache_info_struct { ++ unsigned long add_total; ++ unsigned long del_total; ++ unsigned long find_success; ++ unsigned long find_total; ++ unsigned long noent_race; ++ unsigned long exist_race; ++ unsigned long remove_race; ++}; ++ ++struct kstat_lat_snap_struct { ++ cycles_t maxlat, totlat; ++ unsigned long count; ++}; ++struct kstat_lat_pcpu_snap_struct { ++ cycles_t maxlat, totlat; ++ unsigned long count; ++ seqcount_t lock; ++} ____cacheline_maxaligned_in_smp; ++ ++struct kstat_lat_struct { ++ struct kstat_lat_snap_struct cur, last; ++ cycles_t avg[3]; ++}; ++struct kstat_lat_pcpu_struct { ++ struct kstat_lat_pcpu_snap_struct cur[NR_CPUS]; ++ cycles_t max_snap; ++ struct kstat_lat_snap_struct last; ++ cycles_t avg[3]; ++}; ++ ++struct kstat_perf_snap_struct { ++ cycles_t wall_tottime, cpu_tottime; ++ cycles_t wall_maxdur, cpu_maxdur; ++ unsigned long count; ++}; ++struct kstat_perf_struct { ++ struct kstat_perf_snap_struct cur, last; ++}; ++ ++struct kstat_zone_avg { ++ unsigned long free_pages_avg[3], ++ nr_active_avg[3], ++ nr_inactive_avg[3]; ++}; ++ ++#define KSTAT_ALLOCSTAT_NR 5 ++ ++struct kernel_stat_glob { ++ unsigned long nr_unint_avg[3]; ++ ++ unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR]; ++ struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR]; ++ struct kstat_lat_pcpu_struct sched_lat; ++ struct kstat_lat_struct swap_in; ++ ++ struct kstat_perf_struct ttfp, cache_reap, ++ refill_inact, shrink_icache, shrink_dcache; ++ ++ struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */ ++} ____cacheline_aligned; ++ ++extern struct kernel_stat_glob kstat_glob ____cacheline_aligned; ++extern spinlock_t kstat_glb_lock; ++ ++#ifdef CONFIG_VE ++#define KSTAT_PERF_ENTER(name) \ ++ unsigned long flags; \ ++ cycles_t start, 
sleep_time; \ ++ \ ++ start = get_cycles(); \ ++ sleep_time = VE_TASK_INFO(current)->sleep_time; \ ++ ++#define KSTAT_PERF_LEAVE(name) \ ++ spin_lock_irqsave(&kstat_glb_lock, flags); \ ++ kstat_glob.name.cur.count++; \ ++ start = get_cycles() - start; \ ++ if (kstat_glob.name.cur.wall_maxdur < start) \ ++ kstat_glob.name.cur.wall_maxdur = start;\ ++ kstat_glob.name.cur.wall_tottime += start; \ ++ start -= VE_TASK_INFO(current)->sleep_time - \ ++ sleep_time; \ ++ if (kstat_glob.name.cur.cpu_maxdur < start) \ ++ kstat_glob.name.cur.cpu_maxdur = start; \ ++ kstat_glob.name.cur.cpu_tottime += start; \ ++ spin_unlock_irqrestore(&kstat_glb_lock, flags); \ ++ ++#else ++#define KSTAT_PERF_ENTER(name) ++#define KSTAT_PERF_LEAVE(name) ++#endif ++ ++/* ++ * Add another statistics reading. ++ * Serialization is the caller's due. ++ */ ++static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p, ++ cycles_t dur) ++{ ++ p->cur.count++; ++ if (p->cur.maxlat < dur) ++ p->cur.maxlat = dur; ++ p->cur.totlat += dur; ++} ++ ++static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu, ++ cycles_t dur) ++{ ++ struct kstat_lat_pcpu_snap_struct *cur; ++ ++ cur = &p->cur[cpu]; ++ write_seqcount_begin(&cur->lock); ++ cur->count++; ++ if (cur->maxlat < dur) ++ cur->maxlat = dur; ++ cur->totlat += dur; ++ write_seqcount_end(&cur->lock); ++} ++ ++/* ++ * Move current statistics to last, clear last. ++ * Serialization is the caller's due. ++ */ ++static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p) ++{ ++ cycles_t m; ++ memcpy(&p->last, &p->cur, sizeof(p->last)); ++ p->cur.maxlat = 0; ++ m = p->last.maxlat; ++ CALC_LOAD(p->avg[0], EXP_1, m) ++ CALC_LOAD(p->avg[1], EXP_5, m) ++ CALC_LOAD(p->avg[2], EXP_15, m) ++} ++ ++static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p) ++{ ++ unsigned i, cpu; ++ struct kstat_lat_pcpu_snap_struct snap, *cur; ++ cycles_t m; ++ ++ memset(&p->last, 0, sizeof(p->last)); ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ cur = &p->cur[cpu]; ++ do { ++ i = read_seqcount_begin(&cur->lock); ++ memcpy(&snap, cur, sizeof(snap)); ++ } while (read_seqcount_retry(&cur->lock, i)); ++ /* ++ * read above and this update of maxlat is not atomic, ++ * but this is OK, since it happens rarely and losing ++ * a couple of peaks is not essential. xemul ++ */ ++ cur->maxlat = 0; ++ ++ p->last.count += snap.count; ++ p->last.totlat += snap.totlat; ++ if (p->last.maxlat < snap.maxlat) ++ p->last.maxlat = snap.maxlat; ++ } ++ ++ m = (p->last.maxlat > p->max_snap ? 
p->last.maxlat : p->max_snap); ++ CALC_LOAD(p->avg[0], EXP_1, m); ++ CALC_LOAD(p->avg[1], EXP_5, m); ++ CALC_LOAD(p->avg[2], EXP_15, m); ++ /* reset max_snap to calculate it correctly next time */ ++ p->max_snap = 0; ++} ++ ++#endif /* __VZSTAT_H__ */ +diff -uprN linux-2.6.15.orig/include/net/af_unix.h linux-2.6.15-ve025stab014/include/net/af_unix.h +--- linux-2.6.15.orig/include/net/af_unix.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/af_unix.h 2006-01-27 14:48:08.000000000 +0300 +@@ -19,23 +19,37 @@ extern atomic_t unix_tot_inflight; + + static inline struct sock *first_unix_socket(int *i) + { ++ struct sock *s; ++ struct ve_struct *ve; ++ ++ ve = get_exec_env(); + for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { +- if (!hlist_empty(&unix_socket_table[*i])) +- return __sk_head(&unix_socket_table[*i]); ++ for (s = sk_head(&unix_socket_table[*i]); ++ s != NULL && !ve_accessible(s->sk_owner_env, ve); ++ s = sk_next(s)); ++ if (s != NULL) ++ return s; + } + return NULL; + } + + static inline struct sock *next_unix_socket(int *i, struct sock *s) + { +- struct sock *next = sk_next(s); +- /* More in this chain? */ +- if (next) +- return next; ++ struct ve_struct *ve; ++ ++ ve = get_exec_env(); ++ for (s = sk_next(s); s != NULL; s = sk_next(s)) { ++ if (!ve_accessible(s->sk_owner_env, ve)) ++ continue; ++ return s; ++ } + /* Look for next non-empty chain. */ + for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { +- if (!hlist_empty(&unix_socket_table[*i])) +- return __sk_head(&unix_socket_table[*i]); ++ for (s = sk_head(&unix_socket_table[*i]); ++ s != NULL && !ve_accessible(s->sk_owner_env, ve); ++ s = sk_next(s)); ++ if (s != NULL) ++ return s; + } + return NULL; + } +diff -uprN linux-2.6.15.orig/include/net/flow.h linux-2.6.15-ve025stab014/include/net/flow.h +--- linux-2.6.15.orig/include/net/flow.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/flow.h 2006-01-27 14:48:08.000000000 +0300 +@@ -10,6 +10,7 @@ + #include <linux/in6.h> + #include <asm/atomic.h> + ++struct ve_struct; + struct flowi { + int oif; + int iif; +@@ -78,6 +79,9 @@ struct flowi { + #define fl_icmp_type uli_u.icmpt.type + #define fl_icmp_code uli_u.icmpt.code + #define fl_ipsec_spi uli_u.spi ++#ifdef CONFIG_VE ++ struct ve_struct *owner_env; ++#endif + } __attribute__((__aligned__(BITS_PER_LONG/8))); + + #define FLOW_DIR_IN 0 +diff -uprN linux-2.6.15.orig/include/net/icmp.h linux-2.6.15-ve025stab014/include/net/icmp.h +--- linux-2.6.15.orig/include/net/icmp.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/icmp.h 2006-01-27 14:48:08.000000000 +0300 +@@ -34,9 +34,14 @@ struct icmp_err { + + extern struct icmp_err icmp_err_convert[]; + DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); +-#define ICMP_INC_STATS(field) SNMP_INC_STATS(icmp_statistics, field) +-#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) +-#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_icmp_statistics (get_exec_env()->_icmp_statistics) ++#else ++#define ve_icmp_statistics icmp_statistics ++#endif ++#define ICMP_INC_STATS(field) SNMP_INC_STATS(ve_icmp_statistics, field) ++#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmp_statistics, field) ++#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmp_statistics, field) + + extern void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info); + extern int 
icmp_rcv(struct sk_buff *skb); +diff -uprN linux-2.6.15.orig/include/net/inet_hashtables.h linux-2.6.15-ve025stab014/include/net/inet_hashtables.h +--- linux-2.6.15.orig/include/net/inet_hashtables.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/inet_hashtables.h 2006-01-27 14:48:08.000000000 +0300 +@@ -24,6 +24,7 @@ + #include <linux/spinlock.h> + #include <linux/types.h> + #include <linux/wait.h> ++#include <linux/ve_owner.h> + + #include <net/inet_connection_sock.h> + #include <net/route.h> +@@ -74,11 +75,13 @@ struct inet_ehash_bucket { + * ports are created in O(1) time? I thought so. ;-) -DaveM + */ + struct inet_bind_bucket { ++ struct ve_struct *owner_env; + unsigned short port; + signed short fastreuse; + struct hlist_node node; + struct hlist_head owners; + }; ++DCL_VE_OWNER_PROTO(TB, struct inet_bind_bucket, owner_env) + + #define inet_bind_bucket_for_each(tb, node, head) \ + hlist_for_each_entry(tb, node, head, node) +@@ -129,9 +132,10 @@ struct inet_hashinfo { + }; + + static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, +- const __u32 faddr, const __u16 fport) ++ const __u32 faddr, const __u16 fport, ++ const envid_t veid) + { +- unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); ++ int h = (laddr ^ lport) ^ (faddr ^ fport) ^ (veid ^ (veid >> 16)); + h ^= h >> 16; + h ^= h >> 8; + return h; +@@ -144,8 +148,9 @@ static inline int inet_sk_ehashfn(const + const __u16 lport = inet->num; + const __u32 faddr = inet->daddr; + const __u16 fport = inet->dport; ++ envid_t veid = VEID(VE_OWNER_SK(sk)); + +- return inet_ehashfn(laddr, lport, faddr, fport); ++ return inet_ehashfn(laddr, lport, faddr, fport, veid); + } + + static inline struct inet_ehash_bucket *inet_ehash_bucket( +@@ -158,37 +163,43 @@ static inline struct inet_ehash_bucket * + extern struct inet_bind_bucket * + inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, +- const unsigned short snum); ++ const unsigned short snum, ++ struct ve_struct *env); + extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, + struct inet_bind_bucket *tb); + +-static inline int inet_bhashfn(const __u16 lport, const int bhash_size) ++static inline int inet_bhashfn(const __u16 lport, const int bhash_size, ++ unsigned veid) + { +- return lport & (bhash_size - 1); ++ return ((lport + (veid ^ (veid >> 16))) & (bhash_size - 1)); + } + + extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum); + + /* These can have wildcards, don't try too hard. */ +-static inline int inet_lhashfn(const unsigned short num) ++static inline int inet_lhashfn(const unsigned short num, unsigned veid) + { +- return num & (INET_LHTABLE_SIZE - 1); ++ return ((num + (veid ^ (veid >> 16))) & (INET_LHTABLE_SIZE - 1)); + } + + static inline int inet_sk_listen_hashfn(const struct sock *sk) + { +- return inet_lhashfn(inet_sk(sk)->num); ++ return inet_lhashfn(inet_sk(sk)->num, VEID(VE_OWNER_SK(sk))); + } + + /* Caller must disable local BH processing. 
*/ + static inline void __inet_inherit_port(struct inet_hashinfo *table, + struct sock *sk, struct sock *child) + { +- const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); +- struct inet_bind_hashbucket *head = &table->bhash[bhash]; ++ int bhash; ++ struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + ++ bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size, ++ VEID(VE_OWNER_SK(child))); ++ head = &table->bhash[bhash]; ++ + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + sk_add_bind_node(child, &tb->owners); +@@ -294,7 +305,8 @@ static inline int inet_iif(const struct + extern struct sock *__inet_lookup_listener(const struct hlist_head *head, + const u32 daddr, + const unsigned short hnum, +- const int dif); ++ const int dif, ++ struct ve_struct *env); + + /* Optimize the common listener case. */ + static inline struct sock * +@@ -304,18 +316,21 @@ static inline struct sock * + { + struct sock *sk = NULL; + const struct hlist_head *head; ++ struct ve_struct *env; + ++ env = get_exec_env(); + read_lock(&hashinfo->lhash_lock); +- head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; ++ head = &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))]; + if (!hlist_empty(head)) { + const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); + + if (inet->num == hnum && !sk->sk_node.next && ++ ve_accessible_strict(VE_OWNER_SK(sk), env) && + (!inet->rcv_saddr || inet->rcv_saddr == daddr) && + (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && + !sk->sk_bound_dev_if) + goto sherry_cache; +- sk = __inet_lookup_listener(head, daddr, hnum, dif); ++ sk = __inet_lookup_listener(head, daddr, hnum, dif, env); + } + if (sk) { + sherry_cache: +@@ -342,25 +357,25 @@ sherry_cache: + #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ + const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr)); + #endif /* __BIG_ENDIAN */ +-#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ ++#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ + (((__sk)->sk_hash == (__hash)) && \ + ((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +-#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ ++#define INET_TW_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ + (((__sk)->sk_hash == (__hash)) && \ + ((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) + #else /* 32-bit arch */ + #define INET_ADDR_COOKIE(__name, __saddr, __daddr) +-#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ ++#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ + (((__sk)->sk_hash == (__hash)) && \ + (inet_sk(__sk)->daddr == (__saddr)) && \ + (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +-#define INET_TW_MATCH(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ ++#define INET_TW_MATCH_ALLVE(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ + (((__sk)->sk_hash == (__hash)) && \ + (inet_twsk(__sk)->tw_daddr == (__saddr)) && \ + (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ +@@ -368,6 +383,18 
@@ sherry_cache: + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) + #endif /* 64-bit arch */ + ++#define INET_MATCH(__sk, __hash, __cookie, __saddr, \ ++ __daddr, __ports, __dif, __ve) \ ++ (INET_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ ++ (__daddr), (__ports), (__dif)) \ ++ && ve_accessible_strict(VE_OWNER_SK(__sk), (__ve))) ++ ++#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, \ ++ __daddr, __ports, __dif, __ve) \ ++ (INET_TW_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ ++ (__daddr), (__ports), (__dif)) \ ++ && ve_accessible_strict(inet_twsk(__sk)->tw_owner_env, VEID(__ve))) ++ + /* + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need + * not check it for lookups anymore, thanks Alexey. -DaveM +@@ -387,19 +414,25 @@ static inline struct sock * + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ +- unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); +- struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); +- ++ unsigned int hash; ++ struct inet_ehash_bucket *head; ++ struct ve_struct *env; ++ ++ env = get_exec_env(); ++ hash = inet_ehashfn(daddr, hnum, saddr, sport, VEID(env)); ++ head = inet_ehash_bucket(hashinfo, hash); + prefetch(head->chain.first); + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { +- if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) ++ if (INET_MATCH(sk, hash, acookie, saddr, daddr, ++ ports, dif, env)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { +- if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) ++ if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ++ ports, dif, env)) + goto hit; + } + sk = NULL; +diff -uprN linux-2.6.15.orig/include/net/inet_timewait_sock.h linux-2.6.15-ve025stab014/include/net/inet_timewait_sock.h +--- linux-2.6.15.orig/include/net/inet_timewait_sock.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/inet_timewait_sock.h 2006-01-27 14:48:08.000000000 +0300 +@@ -132,6 +132,7 @@ struct inet_timewait_sock { + unsigned long tw_ttd; + struct inet_bind_bucket *tw_tb; + struct hlist_node tw_death_node; ++ envid_t tw_owner_env; + }; + + static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, +diff -uprN linux-2.6.15.orig/include/net/ip.h linux-2.6.15-ve025stab014/include/net/ip.h +--- linux-2.6.15.orig/include/net/ip.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/ip.h 2006-01-27 14:48:08.000000000 +0300 +@@ -149,15 +149,25 @@ struct ipv4_config + + extern struct ipv4_config ipv4_config; + DECLARE_SNMP_STAT(struct ipstats_mib, ip_statistics); +-#define IP_INC_STATS(field) SNMP_INC_STATS(ip_statistics, field) +-#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ip_statistics, field) +-#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ip_statistics, field) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_ip_statistics (get_exec_env()->_ip_statistics) ++#else ++#define ve_ip_statistics ip_statistics ++#endif ++#define IP_INC_STATS(field) SNMP_INC_STATS(ve_ip_statistics, field) ++#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ip_statistics, field) ++#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_ip_statistics, field) + DECLARE_SNMP_STAT(struct linux_mib, net_statistics); +-#define NET_INC_STATS(field) 
SNMP_INC_STATS(net_statistics, field) +-#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(net_statistics, field) +-#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(net_statistics, field) +-#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(net_statistics, field, adnd) +-#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(net_statistics, field, adnd) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_net_statistics (get_exec_env()->_net_statistics) ++#else ++#define ve_net_statistics net_statistics ++#endif ++#define NET_INC_STATS(field) SNMP_INC_STATS(ve_net_statistics, field) ++#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_net_statistics, field) ++#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_net_statistics, field) ++#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(ve_net_statistics, field, adnd) ++#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(ve_net_statistics, field, adnd) + + extern int sysctl_local_port_range[2]; + extern int sysctl_ip_default_ttl; +@@ -375,4 +385,11 @@ extern int ip_misc_proc_init(void); + + extern struct ctl_table ipv4_table[]; + ++#ifdef CONFIG_SYSCTL ++extern int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++extern int ipv4_sysctl_forward_strategy(ctl_table *table, int __user *name, ++ int nlen, void __user *oldval, size_t __user *oldlenp, ++ void __user *newval, size_t newlen, void **context); ++#endif + #endif /* _IP_H */ +diff -uprN linux-2.6.15.orig/include/net/ip_fib.h linux-2.6.15-ve025stab014/include/net/ip_fib.h +--- linux-2.6.15.orig/include/net/ip_fib.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/ip_fib.h 2006-01-27 14:48:08.000000000 +0300 +@@ -168,10 +168,22 @@ struct fib_table { + unsigned char tb_data[0]; + }; + ++struct fn_zone; ++struct fn_hash ++{ ++ struct fn_zone *fn_zones[33]; ++ struct fn_zone *fn_zone_list; ++}; ++ + #ifndef CONFIG_IP_MULTIPLE_TABLES + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ip_fib_local_table get_exec_env()->_local_table ++#define ip_fib_main_table get_exec_env()->_main_table ++#else + extern struct fib_table *ip_fib_local_table; + extern struct fib_table *ip_fib_main_table; ++#endif + + static inline struct fib_table *fib_get_table(int id) + { +@@ -203,7 +215,12 @@ static inline void fib_select_default(co + #define ip_fib_local_table (fib_tables[RT_TABLE_LOCAL]) + #define ip_fib_main_table (fib_tables[RT_TABLE_MAIN]) + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define fib_tables get_exec_env()->_fib_tables ++#else + extern struct fib_table * fib_tables[RT_TABLE_MAX+1]; ++#endif ++ + extern int fib_lookup(const struct flowi *flp, struct fib_result *res); + extern struct fib_table *__fib_new_table(int id); + extern void fib_rule_put(struct fib_rule *r); +@@ -248,10 +265,19 @@ extern u32 __fib_res_prefsrc(struct fib + + /* Exported by fib_hash.c */ + extern struct fib_table *fib_hash_init(int id); ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++struct ve_struct; ++extern int init_ve_route(struct ve_struct *ve); ++extern void fini_ve_route(struct ve_struct *ve); ++#else ++#define init_ve_route(ve) (0) ++#define fini_ve_route(ve) do { } while (0) ++#endif + + #ifdef CONFIG_IP_MULTIPLE_TABLES + /* Exported by fib_rules.c */ +- ++extern int fib_rules_create(void); ++extern void fib_rules_destroy(void); + extern int inet_rtm_delrule(struct sk_buff 
*skb, struct nlmsghdr* nlh, void *arg); + extern int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); + extern int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb); +diff -uprN linux-2.6.15.orig/include/net/route.h linux-2.6.15-ve025stab014/include/net/route.h +--- linux-2.6.15.orig/include/net/route.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/route.h 2006-01-27 14:48:08.000000000 +0300 +@@ -200,4 +200,14 @@ static inline struct inet_peer *rt_get_p + + extern ctl_table ipv4_route_table[]; + ++#ifdef CONFIG_SYSCTL ++extern int ipv4_flush_delay; ++extern int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, ++ struct file *filp, void __user *buffer, size_t *lenp, ++ loff_t *ppos); ++extern int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, ++ int __user *name, int nlen, void __user *oldval, ++ size_t __user *oldlenp, void __user *newval, ++ size_t newlen, void **context); ++#endif + #endif /* _ROUTE_H */ +diff -uprN linux-2.6.15.orig/include/net/scm.h linux-2.6.15-ve025stab014/include/net/scm.h +--- linux-2.6.15.orig/include/net/scm.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/scm.h 2006-01-27 14:48:08.000000000 +0300 +@@ -40,7 +40,7 @@ static __inline__ int scm_send(struct so + memset(scm, 0, sizeof(*scm)); + scm->creds.uid = current->uid; + scm->creds.gid = current->gid; +- scm->creds.pid = current->tgid; ++ scm->creds.pid = virt_tgid(current); + if (msg->msg_controllen <= 0) + return 0; + return __scm_send(sock, msg, scm); +diff -uprN linux-2.6.15.orig/include/net/sock.h linux-2.6.15-ve025stab014/include/net/sock.h +--- linux-2.6.15.orig/include/net/sock.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/sock.h 2006-01-27 14:48:08.000000000 +0300 +@@ -55,6 +55,8 @@ + #include <net/dst.h> + #include <net/checksum.h> + ++#include <ub/ub_net.h> ++ + /* + * This structure really needs to be cleaned up. + * Most of it is for TCP, and not used by any of +@@ -251,8 +253,12 @@ struct sock { + int (*sk_backlog_rcv)(struct sock *sk, + struct sk_buff *skb); + void (*sk_destruct)(struct sock *sk); ++ struct sock_beancounter sk_bc; ++ struct ve_struct *sk_owner_env; + }; + ++DCL_VE_OWNER_PROTO(SK, struct sock, sk_owner_env) ++ + /* + * Hashed lists helper routines + */ +@@ -485,7 +491,8 @@ static inline void sk_add_backlog(struct + }) + + extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); +-extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); ++extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p, ++ unsigned long amount); + extern void sk_stream_wait_close(struct sock *sk, long timeo_p); + extern int sk_stream_error(struct sock *sk, int flags, int err); + extern void sk_stream_kill_queues(struct sock *sk); +@@ -706,8 +713,11 @@ static inline void sk_stream_writequeue_ + + static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb) + { +- return (int)skb->truesize <= sk->sk_forward_alloc || +- sk_stream_mem_schedule(sk, skb->truesize, 1); ++ if ((int)skb->truesize > sk->sk_forward_alloc && ++ !sk_stream_mem_schedule(sk, skb->truesize, 1)) ++ /* The situation is bad according to mainstream. 
Den */ ++ return 0; ++ return ub_tcprcvbuf_charge(sk, skb) == 0; + } + + static inline int sk_stream_wmem_schedule(struct sock *sk, int size) +@@ -765,6 +775,11 @@ extern struct sk_buff *sock_alloc_send + unsigned long size, + int noblock, + int *errcode); ++extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk, ++ unsigned long size, ++ unsigned long size2, ++ int noblock, ++ int *errcode); + extern void *sock_kmalloc(struct sock *sk, int size, + gfp_t priority); + extern void sock_kfree_s(struct sock *sk, void *mem, int size); +@@ -1119,6 +1134,10 @@ static inline int sock_queue_rcv_skb(str + goto out; + } + ++ err = ub_sockrcvbuf_charge(sk, skb); ++ if (err < 0) ++ goto out; ++ + /* It would be deadlock, if sock_queue_rcv_skb is used + with socket lock! We assume that users of this + function are lock free. +diff -uprN linux-2.6.15.orig/include/net/tcp.h linux-2.6.15-ve025stab014/include/net/tcp.h +--- linux-2.6.15.orig/include/net/tcp.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/tcp.h 2006-01-27 14:48:08.000000000 +0300 +@@ -40,6 +40,7 @@ + #include <net/tcp_states.h> + + #include <linux/seq_file.h> ++#include <ub/ub_net.h> + + extern struct inet_hashinfo tcp_hashinfo; + +@@ -297,12 +298,17 @@ static inline int between(__u32 seq1, __ + extern struct proto tcp_prot; + + DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics); +-#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field) +-#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field) +-#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field) +-#define TCP_DEC_STATS(field) SNMP_DEC_STATS(tcp_statistics, field) +-#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val) +-#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_tcp_statistics (get_exec_env()->_tcp_statistics) ++#else ++#define ve_tcp_statistics tcp_statistics ++#endif ++#define TCP_INC_STATS(field) SNMP_INC_STATS(ve_tcp_statistics, field) ++#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_tcp_statistics, field) ++#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_tcp_statistics, field) ++#define TCP_DEC_STATS(field) SNMP_DEC_STATS(ve_tcp_statistics, field) ++#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ve_tcp_statistics, field, val) ++#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(ve_tcp_statistics, field, val) + + extern void tcp_v4_err(struct sk_buff *skb, u32); + +diff -uprN linux-2.6.15.orig/include/net/udp.h linux-2.6.15-ve025stab014/include/net/udp.h +--- linux-2.6.15.orig/include/net/udp.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/net/udp.h 2006-01-27 14:48:08.000000000 +0300 +@@ -40,13 +40,19 @@ extern rwlock_t udp_hash_lock; + + extern int udp_port_rover; + +-static inline int udp_lport_inuse(u16 num) ++static inline int udp_hashfn(u16 num, unsigned veid) ++{ ++ return ((num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1)); ++} ++ ++static inline int udp_lport_inuse(u16 num, struct ve_struct *env) + { + struct sock *sk; + struct hlist_node *node; + +- sk_for_each(sk, node, &udp_hash[num & (UDP_HTABLE_SIZE - 1)]) +- if (inet_sk(sk)->num == num) ++ sk_for_each(sk, node, &udp_hash[udp_hashfn(num, VEID(env))]) ++ if (inet_sk(sk)->num == num && ++ ve_accessible_strict(sk->sk_owner_env, env)) + return 1; + return 0; + } +@@ -75,9 +81,14 @@ extern unsigned int udp_poll(struct file + 
poll_table *wait); + + DECLARE_SNMP_STAT(struct udp_mib, udp_statistics); +-#define UDP_INC_STATS(field) SNMP_INC_STATS(udp_statistics, field) +-#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_statistics, field) +-#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_statistics, field) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_udp_statistics (get_exec_env()->_udp_statistics) ++#else ++#define ve_udp_statistics udp_statistics ++#endif ++#define UDP_INC_STATS(field) SNMP_INC_STATS(ve_udp_statistics, field) ++#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_udp_statistics, field) ++#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_udp_statistics, field) + + /* /proc */ + struct udp_seq_afinfo { +diff -uprN linux-2.6.15.orig/include/ub/beancounter.h linux-2.6.15-ve025stab014/include/ub/beancounter.h +--- linux-2.6.15.orig/include/ub/beancounter.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/beancounter.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,329 @@ ++/* ++ * include/ub/beancounter.h ++ * ++ * Copyright (C) 1999-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * Andrey Savochkin saw@sw-soft.com ++ * ++ */ ++ ++#ifndef _LINUX_BEANCOUNTER_H ++#define _LINUX_BEANCOUNTER_H ++ ++#include <linux/config.h> ++ ++/* ++ * Generic ratelimiting stuff. ++ */ ++ ++struct ub_rate_info { ++ int burst; ++ int interval; /* jiffy_t per event */ ++ int bucket; /* kind of leaky bucket */ ++ unsigned long last; /* last event */ ++}; ++ ++/* Return true if rate limit permits. */ ++int ub_ratelimit(struct ub_rate_info *); ++ ++ ++/* ++ * This magic is used to distinuish user beancounter and pages beancounter ++ * in struct page. page_ub and page_bc are placed in union and MAGIC ++ * ensures us that we don't use pbc as ubc in ub_page_uncharge(). ++ */ ++#define UB_MAGIC 0x62756275 ++ ++/* ++ * Resource list. ++ */ ++ ++#define UB_KMEMSIZE 0 /* Unswappable kernel memory size including ++ * struct task, page directories, etc. ++ */ ++#define UB_LOCKEDPAGES 1 /* Mlock()ed pages. */ ++#define UB_PRIVVMPAGES 2 /* Total number of pages, counting potentially ++ * private pages as private and used. ++ */ ++#define UB_SHMPAGES 3 /* IPC SHM segment size. */ ++#define UB_ZSHMPAGES 4 /* Anonymous shared memory. */ ++#define UB_NUMPROC 5 /* Number of processes. */ ++#define UB_PHYSPAGES 6 /* All resident pages, for swapout guarantee. */ ++#define UB_VMGUARPAGES 7 /* Guarantee for memory allocation, ++ * checked against PRIVVMPAGES. ++ */ ++#define UB_OOMGUARPAGES 8 /* Guarantees against OOM kill. ++ * Only limit is used, no accounting. ++ */ ++#define UB_NUMTCPSOCK 9 /* Number of TCP sockets. */ ++#define UB_NUMFLOCK 10 /* Number of file locks. */ ++#define UB_NUMPTY 11 /* Number of PTYs. */ ++#define UB_NUMSIGINFO 12 /* Number of siginfos. */ ++#define UB_TCPSNDBUF 13 /* Total size of tcp send buffers. */ ++#define UB_TCPRCVBUF 14 /* Total size of tcp receive buffers. */ ++#define UB_OTHERSOCKBUF 15 /* Total size of other socket ++ * send buffers (all buffers for PF_UNIX). ++ */ ++#define UB_DGRAMRCVBUF 16 /* Total size of other socket ++ * receive buffers. ++ */ ++#define UB_NUMOTHERSOCK 17 /* Number of other sockets. */ ++#define UB_DCACHESIZE 18 /* Size of busy dentry/inode cache. */ ++#define UB_NUMFILE 19 /* Number of open files. 
*/ ++ ++#define UB_RESOURCES 24 ++ ++#define UB_UNUSEDPRIVVM (UB_RESOURCES + 0) ++#define UB_TMPFSPAGES (UB_RESOURCES + 1) ++#define UB_SWAPPAGES (UB_RESOURCES + 2) ++#define UB_HELDPAGES (UB_RESOURCES + 3) ++ ++struct ubparm { ++ /* ++ * A barrier over which resource allocations are failed gracefully. ++ * If the amount of consumed memory is over the barrier further sbrk() ++ * or mmap() calls fail, the existing processes are not killed. ++ */ ++ unsigned long barrier; ++ /* hard resource limit */ ++ unsigned long limit; ++ /* consumed resources */ ++ unsigned long held; ++ /* maximum amount of consumed resources through the last period */ ++ unsigned long maxheld; ++ /* minimum amount of consumed resources through the last period */ ++ unsigned long minheld; ++ /* count of failed charges */ ++ unsigned long failcnt; ++}; ++ ++/* ++ * Kernel internal part. ++ */ ++ ++#ifdef __KERNEL__ ++ ++#include <ub/ub_debug.h> ++#include <linux/interrupt.h> ++#include <asm/atomic.h> ++#include <linux/spinlock.h> ++#include <linux/cache.h> ++#include <linux/threads.h> ++ ++/* ++ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form. ++ */ ++#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1) ++ ++ ++/* ++ * Resource management structures ++ * Serialization issues: ++ * beancounter list management is protected via ub_hash_lock ++ * task pointers are set only for current task and only once ++ * refcount is managed atomically ++ * value and limit comparison and change are protected by per-ub spinlock ++ */ ++ ++struct page_beancounter; ++struct task_beancounter; ++struct sock_beancounter; ++ ++struct page_private { ++ unsigned long ubp_unused_privvmpages; ++ unsigned long ubp_tmpfs_respages; ++ unsigned long ubp_swap_pages; ++ unsigned long long ubp_held_pages; ++}; ++ ++struct sock_private { ++ unsigned long ubp_rmem_thres; ++ unsigned long ubp_wmem_pressure; ++ unsigned long ubp_maxadvmss; ++ unsigned long ubp_rmem_pressure; ++#define UB_RMEM_EXPAND 0 ++#define UB_RMEM_KEEP 1 ++#define UB_RMEM_SHRINK 2 ++ struct list_head ubp_other_socks; ++ struct list_head ubp_tcp_socks; ++ atomic_t ubp_orphan_count; ++}; ++ ++struct ub_perfstat { ++ unsigned long unmap; ++ unsigned long swapin; ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ long pages_charged; ++ long vmalloc_charged; ++ long pbcs; ++#endif ++} ____cacheline_aligned_in_smp; ++ ++struct user_beancounter ++{ ++ unsigned long ub_magic; ++ atomic_t ub_refcount; ++ struct user_beancounter *ub_next; ++ spinlock_t ub_lock; ++ uid_t ub_uid; ++ ++ struct ub_rate_info ub_limit_rl; ++ int ub_oom_noproc; ++ ++ struct page_private ppriv; ++#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages ++#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages ++#define ub_swap_pages ppriv.ubp_swap_pages ++#define ub_held_pages ppriv.ubp_held_pages ++ struct sock_private spriv; ++#define ub_rmem_thres spriv.ubp_rmem_thres ++#define ub_maxadvmss spriv.ubp_maxadvmss ++#define ub_rmem_pressure spriv.ubp_rmem_pressure ++#define ub_wmem_pressure spriv.ubp_wmem_pressure ++#define ub_tcp_sk_list spriv.ubp_tcp_socks ++#define ub_other_sk_list spriv.ubp_other_socks ++#define ub_orphan_count spriv.ubp_orphan_count ++ ++ struct user_beancounter *parent; ++ void *private_data; ++ ++ /* resources statistic and settings */ ++ struct ubparm ub_parms[UB_RESOURCES]; ++ /* resources statistic for last interval */ ++ struct ubparm ub_store[UB_RESOURCES]; ++ ++ struct ub_perfstat ub_stat[NR_CPUS]; ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ struct list_head ub_cclist; 
++#endif ++}; ++ ++enum severity { UB_HARD, UB_SOFT, UB_FORCE }; ++ ++static inline int ub_barrier_hit(struct user_beancounter *ub, int resource) ++{ ++ return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier; ++} ++ ++static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource) ++{ ++ return (ub->ub_parms[resource].held > ++ ((ub->ub_parms[resource].barrier) >> 1)); ++} ++ ++#ifndef CONFIG_USER_RESOURCE ++ ++extern inline struct user_beancounter *get_beancounter_byuid ++ (uid_t uid, int create) { return NULL; } ++extern inline struct user_beancounter *get_beancounter ++ (struct user_beancounter *ub) { return NULL; } ++extern inline void put_beancounter(struct user_beancounter *ub) {;} ++ ++static inline void ub_init_cache(unsigned long mempages) { }; ++static inline void ub_init_ub0(void) { }; ++ ++#define get_ub0() NULL ++ ++#else /* CONFIG_USER_RESOURCE */ ++ ++/* ++ * Charge/uncharge operations ++ */ ++ ++extern int __charge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val, enum severity strict); ++ ++extern void __uncharge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val); ++ ++extern void __put_beancounter(struct user_beancounter *ub); ++ ++extern void uncharge_warn(struct user_beancounter *ub, int resource, ++ unsigned long val, unsigned long held); ++ ++extern const char *ub_rnames[]; ++/* ++ * Put a beancounter reference ++ */ ++ ++static inline void put_beancounter(struct user_beancounter *ub) ++{ ++ if (unlikely(ub == NULL)) ++ return; ++ ++ __put_beancounter(ub); ++} ++ ++/* ++ * Create a new beancounter reference ++ */ ++extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create); ++ ++static inline ++struct user_beancounter *get_beancounter(struct user_beancounter *ub) ++{ ++ if (unlikely(ub == NULL)) ++ return NULL; ++ ++ atomic_inc(&ub->ub_refcount); ++ return ub; ++} ++ ++extern struct user_beancounter *get_subbeancounter_byid( ++ struct user_beancounter *, ++ int id, int create); ++extern struct user_beancounter *subbeancounter_findcreate( ++ struct user_beancounter *p, int id); ++ ++extern struct user_beancounter ub0; ++ ++extern void ub_init_cache(unsigned long); ++extern void ub_init_ub0(void); ++#define get_ub0() (&ub0) ++ ++extern void print_ub_uid(struct user_beancounter *ub, char *buf, int size); ++ ++/* ++ * Resource charging ++ * Change user's account and compare against limits ++ */ ++ ++static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource) ++{ ++ if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held) ++ ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held; ++ if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held) ++ ub->ub_parms[resource].minheld = ub->ub_parms[resource].held; ++} ++ ++#endif /* CONFIG_USER_RESOURCE */ ++ ++#include <ub/ub_decl.h> ++UB_DECLARE_FUNC(int, charge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val, enum severity strict)); ++UB_DECLARE_VOID_FUNC(uncharge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val)); ++ ++UB_DECLARE_VOID_FUNC(charge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val)); ++UB_DECLARE_VOID_FUNC(uncharge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val)); ++ ++#ifndef CONFIG_USER_RESOURCE_PROC ++static inline void ub_init_proc(void) { }; ++#else ++extern void ub_init_proc(void); ++#endif ++ ++#ifdef CONFIG_USER_RSS_ACCOUNTING ++extern 
void ub_init_pbc(void); ++#else ++static inline void ub_ini_pbc(void) { } ++#endif ++#endif /* __KERNEL__ */ ++#endif /* _LINUX_BEANCOUNTER_H */ +diff -uprN linux-2.6.15.orig/include/ub/ub_dcache.h linux-2.6.15-ve025stab014/include/ub/ub_dcache.h +--- linux-2.6.15.orig/include/ub/ub_dcache.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_dcache.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,57 @@ ++/* ++ * include/ub/ub_dcache.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_DCACHE_H_ ++#define __UB_DCACHE_H_ ++ ++#include <ub/ub_decl.h> ++ ++/* ++ * UB_DCACHESIZE accounting ++ */ ++ ++struct dentry_beancounter ++{ ++ /* ++ * d_inuse = ++ * <number of external refs> + ++ * <number of 'used' childs> ++ * ++ * d_inuse == -1 means that dentry is unused ++ * state change -1 => 0 causes charge ++ * state change 0 => -1 causes uncharge ++ */ ++ atomic_t d_inuse; ++ /* charged size, including name length if name is not inline */ ++ unsigned long d_ubsize; ++ struct user_beancounter *d_ub; ++}; ++ ++struct dentry; ++ ++UB_DECLARE_FUNC(int, ub_dentry_alloc(struct dentry *d)) ++UB_DECLARE_VOID_FUNC(ub_dentry_charge_nofail(struct dentry *d)) ++UB_DECLARE_VOID_FUNC(ub_dentry_uncharge(struct dentry *d)) ++ ++#ifdef CONFIG_USER_RESOURCE ++UB_DECLARE_FUNC(int, ub_dentry_charge(struct dentry *d)) ++#define ub_dget_testone(d) (atomic_inc_and_test(&(d)->dentry_bc.d_inuse)) ++#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse)) ++#define INUSE_INIT 0 ++#else ++#define ub_dentry_charge(d) ({ \ ++ spin_unlock(&d->d_lock); \ ++ rcu_read_unlock(); \ ++ 0; \ ++ }) ++#define ub_dget_testone(d) (0) ++#define ub_dput_testzero(d) (0) ++#endif ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_debug.h linux-2.6.15-ve025stab014/include/ub/ub_debug.h +--- linux-2.6.15.orig/include/ub/ub_debug.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_debug.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,95 @@ ++/* ++ * include/ub/ub_debug.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_DEBUG_H_ ++#define __UB_DEBUG_H_ ++ ++/* ++ * general debugging ++ */ ++ ++#define UBD_ALLOC 0x1 ++#define UBD_CHARGE 0x2 ++#define UBD_LIMIT 0x4 ++#define UBD_TRACE 0x8 ++ ++/* ++ * ub_net debugging ++ */ ++ ++#define UBD_NET_SOCKET 0x10 ++#define UBD_NET_SLEEP 0x20 ++#define UBD_NET_SEND 0x40 ++#define UBD_NET_RECV 0x80 ++ ++/* ++ * Main routines ++ */ ++ ++#define UB_DEBUG (0) ++#define DEBUG_RESOURCE (0ULL) ++ ++#define ub_dbg_cond(__cond, __str, args...) \ ++ do { \ ++ if ((__cond) != 0) \ ++ printk(__str, ##args); \ ++ } while(0) ++ ++#define ub_debug(__section, __str, args...) \ ++ ub_dbg_cond(UB_DEBUG & (__section), __str, ##args) ++ ++#define ub_debug_resource(__resource, __str, args...) 
\ ++ ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && \ ++ (DEBUG_RESOURCE & (1 << (__resource))), \ ++ __str, ##args) ++ ++#if UB_DEBUG & UBD_TRACE ++#define ub_debug_trace(__cond, __b, __r) \ ++ do { \ ++ static struct ub_rate_info ri = { __b, __r }; \ ++ if ((__cond) != 0 && ub_ratelimit(&ri)) \ ++ dump_stack(); \ ++ } while(0) ++#else ++#define ub_debug_trace(__cond, __burst, __rate) ++#endif ++ ++#include <linux/config.h> ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++#include <linux/list.h> ++#include <linux/kmem_cache.h> ++ ++struct user_beancounter; ++struct ub_cache_counter { ++ struct list_head ulist; ++ struct ub_cache_counter *next; ++ struct user_beancounter *ub; ++ kmem_cache_t *cachep; ++ unsigned long counter; ++}; ++ ++extern spinlock_t cc_lock; ++extern void init_cache_counters(void); ++extern void ub_free_counters(struct user_beancounter *); ++extern void ub_kmemcache_free(kmem_cache_t *cachep); ++ ++struct vm_struct; ++extern void inc_vmalloc_charged(struct vm_struct *, int); ++extern void dec_vmalloc_charged(struct vm_struct *); ++#else ++#define init_cache_counters() do { } while (0) ++#define inc_vmalloc_charged(vm, f) do { } while (0) ++#define dec_vmalloc_charged(vm) do { } while (0) ++#define ub_free_counters(ub) do { } while (0) ++#define ub_kmemcache_free(cachep) do { } while (0) ++#endif ++ ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_decl.h linux-2.6.15-ve025stab014/include/ub/ub_decl.h +--- linux-2.6.15.orig/include/ub/ub_decl.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_decl.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,40 @@ ++/* ++ * include/ub/ub_decl.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_DECL_H_ ++#define __UB_DECL_H_ ++ ++#include <linux/config.h> ++ ++/* ++ * Naming convension: ++ * ub_<section|object>_<operation> ++ */ ++ ++#ifdef CONFIG_USER_RESOURCE ++ ++#define UB_DECLARE_FUNC(ret_type, decl) extern ret_type decl; ++#define UB_DECLARE_VOID_FUNC(decl) extern void decl; ++ ++#else /* CONFIG_USER_RESOURCE */ ++ ++#define UB_DECLARE_FUNC(ret_type, decl) \ ++ static inline ret_type decl \ ++ { \ ++ return (ret_type)0; \ ++ } ++#define UB_DECLARE_VOID_FUNC(decl) \ ++ static inline void decl \ ++ { \ ++ } ++ ++#endif /* CONFIG_USER_RESOURCE */ ++ ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_hash.h linux-2.6.15-ve025stab014/include/ub/ub_hash.h +--- linux-2.6.15.orig/include/ub/ub_hash.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_hash.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,41 @@ ++/* ++ * include/ub/ub_hash.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _LINUX_UBHASH_H ++#define _LINUX_UBHASH_H ++ ++#ifdef __KERNEL__ ++ ++#define UB_HASH_SIZE 256 ++ ++struct ub_hash_slot { ++ struct user_beancounter *ubh_beans; ++}; ++ ++extern struct ub_hash_slot ub_hash[]; ++extern spinlock_t ub_hash_lock; ++ ++#ifdef CONFIG_USER_RESOURCE ++ ++/* ++ * Iterate over beancounters ++ * @__slot - hash slot ++ * @__ubp - beancounter ptr ++ * Can use break :) ++ */ ++#define for_each_beancounter(__slot, __ubp) \ ++ for (__slot = 0, __ubp = NULL; \ ++ __slot < UB_HASH_SIZE && __ubp == NULL; __slot++) \ ++ for (__ubp = ub_hash[__slot].ubh_beans; __ubp; \ ++ __ubp = __ubp->ub_next) ++ ++#endif /* CONFIG_USER_RESOURCE */ ++#endif /* __KERNEL__ */ ++#endif /* _LINUX_UBHASH_H */ +diff -uprN linux-2.6.15.orig/include/ub/ub_mem.h linux-2.6.15-ve025stab014/include/ub/ub_mem.h +--- linux-2.6.15.orig/include/ub/ub_mem.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_mem.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,82 @@ ++/* ++ * include/ub/ub_mem.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_SLAB_H_ ++#define __UB_SLAB_H_ ++ ++#include <linux/config.h> ++#include <linux/kmem_slab.h> ++#include <ub/beancounter.h> ++#include <ub/ub_decl.h> ++ ++/* ++ * UB_KMEMSIZE accounting ++ */ ++ ++#ifdef CONFIG_UBC_DEBUG_ITEMS ++#define CHARGE_ORDER(__o) (1 << __o) ++#define CHARGE_SIZE(__s) 1 ++#else ++#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o)) ++#define CHARGE_SIZE(__s) (__s) ++#endif ++ ++#define page_ub(__page) ((__page)->bc.page_ub) ++ ++struct mm_struct; ++struct page; ++ ++UB_DECLARE_FUNC(struct user_beancounter *, slab_ub(void *obj)) ++UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj)) ++UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj)) ++ ++UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, int mask)) ++UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order)) ++UB_DECLARE_FUNC(int, ub_slab_charge(void *objp, int flags)) ++UB_DECLARE_VOID_FUNC(ub_slab_uncharge(void *obj)) ++ ++#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\ ++ (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\ ++ sizeof(void *)))) ++ ++#ifdef CONFIG_USER_RESOURCE ++/* Flags without __GFP_UBC must comply with vmalloc */ ++#define ub_vmalloc(size) __vmalloc(size, \ ++ GFP_KERNEL | __GFP_HIGHMEM | __GFP_UBC, PAGE_KERNEL) ++#define ub_kmalloc(size, flags) kmalloc(size, ((flags) | __GFP_UBC)) ++extern struct user_beancounter *ub_select_worst(long *); ++ ++/* mm/slab.c needed stuff */ ++#define UB_ALIGN(flags) (flags & SLAB_UBC ? sizeof(void *) : 1) ++#define UB_EXTRA(flags) (flags & SLAB_UBC ? 
sizeof(void *) : 0) ++#define set_cache_objuse(cachep) do { \ ++ (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \ ++ (cachep)->num - 1) / (cachep)->num; \ ++ if (!OFF_SLAB(cachep)) \ ++ break; \ ++ (cachep)->objuse += ((cachep)->slabp_cache->objuse + \ ++ (cachep)->num - 1) / (cachep)->num; \ ++ } while (0) ++#define init_slab_ubps(cachep, slabp) do { \ ++ if (!((cachep)->flags & SLAB_UBC)) \ ++ break; \ ++ memset(slab_ubcs(cachep, slabp), 0, \ ++ (cachep)->num * sizeof(void *)); \ ++ } while (0) ++#define kmem_obj_memusage(o) (GET_PAGE_CACHE(virt_to_page(o))->objuse) ++#else ++#define ub_vmalloc(size) vmalloc(size) ++#define ub_kmalloc(size, flags) kmalloc(size, flags) ++#define UB_ALIGN(flags) 1 ++#define UB_EXTRA(flags) 0 ++#define set_cache_objuse(c) do { } while (0) ++#define init_slab_ubps(c, s) do { } while (0) ++#endif ++#endif /* __UB_SLAB_H_ */ +diff -uprN linux-2.6.15.orig/include/ub/ub_misc.h linux-2.6.15-ve025stab014/include/ub/ub_misc.h +--- linux-2.6.15.orig/include/ub/ub_misc.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_misc.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,49 @@ ++/* ++ * include/ub/ub_misc.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_MISC_H_ ++#define __UB_MISC_H_ ++ ++#include <ub/ub_decl.h> ++ ++struct tty_struct; ++struct file; ++struct file_lock; ++struct sigqueue; ++ ++UB_DECLARE_FUNC(int, ub_file_charge(struct file *f)) ++UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f)) ++UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard)) ++UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl)) ++UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q, ++ struct user_beancounter *ub)) ++UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q)) ++UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent, ++ struct task_struct *task)) ++UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task)) ++UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty)) ++UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty)) ++ ++#ifdef CONFIG_USER_RESOURCE ++#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0) ++#define set_mm_ub(mm, tsk) do { \ ++ (mm)->mm_ub = get_beancounter(tsk ? \ ++ tsk->task_bc.task_ub : get_exec_ub()); \ ++ } while (0) ++#define put_mm_ub(mm) do { \ ++ put_beancounter((mm)->mm_ub); \ ++ (mm)->mm_ub = NULL; \ ++ } while (0) ++#else ++#define set_flock_charged(fl) do { } while (0) ++#define set_mm_ub(mm, tsk) do { } while (0) ++#define put_mm_ub(mm) do { } while (0) ++#endif ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_net.h linux-2.6.15-ve025stab014/include/ub/ub_net.h +--- linux-2.6.15.orig/include/ub/ub_net.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_net.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,141 @@ ++/* ++ * include/ub/ub_net.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_NET_H_ ++#define __UB_NET_H_ ++ ++/* ++ * UB_NUMXXXSOCK, UB_XXXBUF accounting ++ */ ++ ++#include <ub/ub_decl.h> ++#include <ub/ub_sk.h> ++ ++#define bid2sid(__bufid) \ ++ ((__bufid) == UB_TCPSNDBUF ? 
UB_NUMTCPSOCK : UB_NUMOTHERSOCK) ++ ++#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \ ++ ~(SMP_CACHE_BYTES-1))) ++#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE) ++ ++ ++#define IS_TCP_SOCK(__family, __type) \ ++ ((__family) == PF_INET && (__type) == SOCK_STREAM) ++ ++UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type)) ++UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk)) ++UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk)) ++UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk)) ++UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_skb_alloc_bc(struct sk_buff *skb, int gfp_mask)) ++UB_DECLARE_VOID_FUNC(ub_skb_free_bc(struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)) ++UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)) ++UB_DECLARE_VOID_FUNC(ub_sock_snd_queue_add(struct sock *sk, int resource, ++ unsigned long size)) ++UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, ++ unsigned long size)) ++ ++UB_DECLARE_FUNC(int, ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_tcprcvbuf_charge_forced(struct sock *sk, ++ struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_tcpsndbuf_charge(struct sock *sk, struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_tcpsndbuf_charge_forced(struct sock *sk, ++ struct sk_buff *skb)) ++ ++/* Charge size */ ++static inline unsigned long skb_charge_datalen(unsigned long chargesize) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ unsigned long slabsize; ++ ++ chargesize -= sizeof(struct sk_buff); ++ slabsize = 64; ++ do { ++ slabsize <<= 1; ++ } while (slabsize <= chargesize); ++ ++ slabsize >>= 1; ++ return (slabsize - sizeof(struct skb_shared_info)) & ++ ~(SMP_CACHE_BYTES-1); ++#else ++ return 0; ++#endif ++} ++ ++static inline unsigned long skb_charge_size_gen(unsigned long size) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ unsigned int slabsize; ++ ++ size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info); ++ slabsize = 32; /* min size is 64 because of skb_shared_info */ ++ do { ++ slabsize <<= 1; ++ } while (slabsize < size); ++ ++ return slabsize + sizeof(struct sk_buff); ++#else ++ return 0; ++#endif ++ ++} ++ ++static inline unsigned long skb_charge_size_const(unsigned long size) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ unsigned int ret; ++ if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64) ++ ret = 64 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128) ++ ret = 128 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256) ++ ret = 256 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512) ++ ret = 512 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024) ++ ret = 1024 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048) ++ ret = 2048 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096) ++ ret = 4096 + sizeof(struct sk_buff); ++ else ++ ret = skb_charge_size_gen(size); ++ return ret; ++#else ++ return 0; ++#endif ++} ++ ++ ++#define skb_charge_size(__size) \ ++ (__builtin_constant_p(__size) ? 
\ ++ skb_charge_size_const(__size) : \ ++ skb_charge_size_gen(__size)) ++ ++UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb)) ++UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, ++ struct sock *sk, unsigned long size, int res)) ++ ++/* Poll reserv */ ++UB_DECLARE_FUNC(int, ub_sock_makewres_other(struct sock *sk, unsigned long sz)) ++UB_DECLARE_FUNC(int, ub_sock_makewres_tcp(struct sock *sk, unsigned long size)) ++UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk, unsigned long size)) ++UB_DECLARE_FUNC(int, ub_sock_getwres_tcp(struct sock *sk, unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk, unsigned long size, ++ unsigned long ressize)) ++UB_DECLARE_VOID_FUNC(ub_sock_retwres_tcp(struct sock *sk, unsigned long size, ++ unsigned long ressize)) ++UB_DECLARE_VOID_FUNC(ub_sock_sndqueueadd_other(struct sock *sk, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz)) ++UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk)) ++ ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_oom.h linux-2.6.15-ve025stab014/include/ub/ub_oom.h +--- linux-2.6.15.orig/include/ub/ub_oom.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_oom.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,45 @@ ++#ifndef __UB_OOM_H_ ++#define __UB_OOM_H_ ++ ++#include <ub/ub_decl.h> ++ ++/* oom killer */ ++UB_DECLARE_VOID_FUNC(ub_oom_init(void)) ++UB_DECLARE_FUNC(int, ub_oom_start(void)) ++UB_DECLARE_FUNC(struct user_beancounter *, ub_oom_select_worst(void)) ++UB_DECLARE_VOID_FUNC(ub_oom_kill_task(struct task_struct *tsk, ++ struct mm_struct *mm)) ++UB_DECLARE_VOID_FUNC(ub_oom_stop(void)) ++UB_DECLARE_VOID_FUNC(ub_oom_task_exit(struct task_struct *tsk)) ++ ++#ifdef CONFIG_USER_RESOURCE ++#define ub_oom_task_match(t, ub) (((ub) == NULL) || \ ++ ((t)->mm != NULL && (t)->mm->mm_ub == (ub))) ++#define ub_oom_panic(ub) ((ub) == NULL) ++#else ++#define ub_oom_task_match(t, ub) 1 ++#define ub_oom_panic(ub) 1 ++#endif ++ ++/* vmscan stuff */ ++struct scan_control; ++struct oom_freeing_stat { ++ unsigned long free; ++ unsigned long swap; /* page referrence counters removed */ ++ unsigned long write; /* IO started */ ++ unsigned long slab; /* slabs shrinked */ ++}; ++ ++#ifdef CONFIG_USER_RESOURCE ++#define ub_oom_inc(sc, res, nr) do { (sc)->oom_stat.res += nr; } while (0) ++#define ub_oom_did_progress(sc) (nr_swap_pages && ( \ ++ (sc)->oom_stat.slab || \ ++ (sc)->oom_stat.swap || \ ++ (sc)->oom_stat.write || \ ++ (sc)->oom_stat.free \ ++ )) ++#else ++#define ub_oom_inc(sc, res, nr) do { } while (0) ++#define ub_oom_did_progress(sc) 0 ++#endif ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_orphan.h linux-2.6.15-ve025stab014/include/ub/ub_orphan.h +--- linux-2.6.15.orig/include/ub/ub_orphan.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_orphan.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,56 @@ ++/* ++ * include/ub/ub_orphan.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_ORPHAN_H_ ++#define __UB_ORPHAN_H_ ++ ++#include <net/tcp.h> ++ ++#include "ub/beancounter.h" ++#include "ub/ub_net.h" ++ ++ ++static inline atomic_t *__ub_get_orphan_count_ptr(struct sock *sk) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ if (sock_has_ubc(sk)) ++ return &sock_bc(sk)->ub->ub_orphan_count; ++#endif ++ return sk->sk_prot->orphan_count; ++} ++ ++static inline void ub_inc_orphan_count(struct sock *sk) ++{ ++ atomic_inc(__ub_get_orphan_count_ptr(sk)); ++} ++ ++static inline void ub_dec_orphan_count(struct sock *sk) ++{ ++ atomic_dec(__ub_get_orphan_count_ptr(sk)); ++} ++ ++static inline int ub_get_orphan_count(struct sock *sk) ++{ ++ return atomic_read(__ub_get_orphan_count_ptr(sk)); ++} ++ ++extern int __ub_too_many_orphans(struct sock *sk, int count); ++static inline int ub_too_many_orphans(struct sock *sk, int count) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ if (__ub_too_many_orphans(sk, count)) ++ return 1; ++#endif ++ return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans || ++ (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && ++ atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])); ++} ++ ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_page.h linux-2.6.15-ve025stab014/include/ub/ub_page.h +--- linux-2.6.15.orig/include/ub/ub_page.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_page.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,48 @@ ++/* ++ * include/ub/ub_page.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_PAGE_H_ ++#define __UB_PAGE_H_ ++ ++#include <linux/config.h> ++ ++/* ++ * Page_beancounters ++ */ ++ ++struct page; ++struct user_beancounter; ++ ++#define PB_MAGIC 0x62700001UL ++ ++struct page_beancounter { ++ unsigned long pb_magic; ++ struct page *page; ++ struct user_beancounter *ub; ++ struct page_beancounter *next_hash; ++ unsigned refcount; ++ struct list_head page_list; ++}; ++ ++#define PB_REFCOUNT_BITS 24 ++#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS) ++#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS)) ++#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS)) ++#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1)) ++#define PB_COUNT_INC(c) ((c)++) ++#define PB_COUNT_DEC(c) ((c)--) ++#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c)) ++ ++#define page_pbc(__page) ((__page)->bc.page_pb) ++ ++struct address_space; ++extern int is_shmem_mapping(struct address_space *); ++ ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_sk.h linux-2.6.15-ve025stab014/include/ub/ub_sk.h +--- linux-2.6.15.orig/include/ub/ub_sk.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_sk.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,43 @@ ++/* ++ * include/ub/ub_sk.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_SK_H_ ++#define __UB_SK_H_ ++ ++#include <linux/config.h> ++#include <ub/ub_task.h> ++ ++struct sock; ++struct sk_buff; ++ ++struct skb_beancounter { ++ struct user_beancounter *ub; ++ unsigned long charged:27, resource:5; ++}; ++ ++struct sock_beancounter { ++ /* ++ * already charged for future sends, to make poll work; ++ * changes are protected by bc spinlock, read is under socket ++ * semaphore for sends and unprotected in poll ++ */ ++ unsigned long poll_reserv; ++ unsigned long ub_waitspc; /* space waiting for */ ++ unsigned long ub_wcharged; ++ struct list_head ub_sock_list; ++ struct user_beancounter *ub; ++}; ++ ++#define sock_bc(__sk) (&(__sk)->sk_bc) ++#define skb_bc(__skb) (&(__skb)->skb_bc) ++#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc)) ++#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL) ++ ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_stat.h linux-2.6.15-ve025stab014/include/ub/ub_stat.h +--- linux-2.6.15.orig/include/ub/ub_stat.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_stat.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,70 @@ ++/* ++ * include/ub/ub_stat.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_STAT_H_ ++#define __UB_STAT_H_ ++ ++/* sys_ubstat commands list */ ++#define UBSTAT_READ_ONE 0x010000 ++#define UBSTAT_READ_ALL 0x020000 ++#define UBSTAT_READ_FULL 0x030000 ++#define UBSTAT_UBLIST 0x040000 ++#define UBSTAT_UBPARMNUM 0x050000 ++#define UBSTAT_GETTIME 0x060000 ++ ++#define UBSTAT_CMD(func) ((func) & 0xF0000) ++#define UBSTAT_PARMID(func) ((func) & 0x0FFFF) ++ ++#define TIME_MAX_SEC (LONG_MAX / HZ) ++#define TIME_MAX_JIF (TIME_MAX_SEC * HZ) ++ ++typedef unsigned long ubstattime_t; ++ ++typedef struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstattime_t cur_time; ++} ubgettime_t; ++ ++typedef struct { ++ long maxinterval; ++ int signum; ++} ubnotifrq_t; ++ ++typedef struct { ++ unsigned long maxheld; ++ unsigned long failcnt; ++} ubstatparm_t; ++ ++typedef struct { ++ unsigned long barrier; ++ unsigned long limit; ++ unsigned long held; ++ unsigned long maxheld; ++ unsigned long minheld; ++ unsigned long failcnt; ++ unsigned long __unused1; ++ unsigned long __unused2; ++} ubstatparmf_t; ++ ++typedef struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstatparmf_t param[0]; ++} ubstatfull_t; ++ ++#ifdef __KERNEL__ ++struct ub_stat_notify { ++ struct list_head list; ++ struct task_struct *task; ++ int signum; ++}; ++#endif ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_task.h linux-2.6.15-ve025stab014/include/ub/ub_task.h +--- linux-2.6.15.orig/include/ub/ub_task.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_task.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,49 @@ ++/* ++ * include/ub/ub_task.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_TASK_H_ ++#define __UB_TASK_H_ ++ ++#include <linux/config.h> ++ ++struct user_beancounter; ++ ++ ++#ifdef CONFIG_USER_RESOURCE ++ ++struct task_beancounter { ++ struct user_beancounter *exec_ub; ++ struct user_beancounter *task_ub; ++ struct user_beancounter *fork_sub; ++ void *task_fnode, *task_freserv; ++ unsigned long oom_generation; ++ unsigned long task_data[4]; ++}; ++ ++#define get_exec_ub() (current->task_bc.exec_ub) ++#define get_task_ub(__task) ((__task)->task_bc.task_ub) ++#define set_exec_ub(__newub) \ ++({ \ ++ struct user_beancounter *old; \ ++ struct task_beancounter *tbc; \ ++ tbc = ¤t->task_bc; \ ++ old = tbc->exec_ub; \ ++ tbc->exec_ub = __newub; \ ++ old; \ ++}) ++ ++#else /* CONFIG_USER_RESOURCE */ ++ ++#define get_exec_ub() (NULL) ++#define get_task_ub(task) (NULL) ++#define set_exec_ub(__ub) (NULL) ++ ++#endif /* CONFIG_USER_RESOURCE */ ++#endif /* __UB_TASK_H_ */ +diff -uprN linux-2.6.15.orig/include/ub/ub_tcp.h linux-2.6.15-ve025stab014/include/ub/ub_tcp.h +--- linux-2.6.15.orig/include/ub/ub_tcp.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_tcp.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,79 @@ ++/* ++ * include/ub/ub_tcp.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_TCP_H_ ++#define __UB_TCP_H_ ++ ++/* ++ * UB_NUMXXXSOCK, UB_XXXBUF accounting ++ */ ++ ++#include <ub/ub_sk.h> ++#include <ub/beancounter.h> ++ ++static inline void ub_tcp_update_maxadvmss(struct sock *sk) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ if (!sock_has_ubc(sk)) ++ return; ++ if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss) ++ return; ++ ++ sock_bc(sk)->ub->ub_maxadvmss = ++ skb_charge_size(MAX_HEADER + sizeof(struct iphdr) ++ + sizeof(struct tcphdr) + tcp_sk(sk)->advmss); ++#endif ++} ++ ++static inline int ub_tcp_rmem_allows_expand(struct sock *sk) ++{ ++ if (tcp_memory_pressure) ++ return 0; ++#ifdef CONFIG_USER_RESOURCE ++ if (sock_has_ubc(sk)) { ++ struct user_beancounter *ub; ++ ++ ub = sock_bc(sk)->ub; ++ if (ub->ub_rmem_pressure == UB_RMEM_EXPAND) ++ return 1; ++ if (ub->ub_rmem_pressure == UB_RMEM_SHRINK) ++ return 0; ++ return sk->sk_rcvbuf <= ub->ub_rmem_thres; ++ } ++#endif ++ return 1; ++} ++ ++static inline int ub_tcp_memory_pressure(struct sock *sk) ++{ ++ if (tcp_memory_pressure) ++ return 1; ++#ifdef CONFIG_USER_RESOURCE ++ if (sock_has_ubc(sk)) ++ return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND; ++#endif ++ return 0; ++} ++ ++static inline int ub_tcp_shrink_rcvbuf(struct sock *sk) ++{ ++ if (tcp_memory_pressure) ++ return 1; ++#ifdef CONFIG_USER_RESOURCE ++ if (sock_has_ubc(sk)) ++ return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK; ++#endif ++ return 0; ++} ++ ++UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk)) ++UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk)) ++ ++#endif +diff -uprN linux-2.6.15.orig/include/ub/ub_vmpages.h linux-2.6.15-ve025stab014/include/ub/ub_vmpages.h +--- linux-2.6.15.orig/include/ub/ub_vmpages.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/include/ub/ub_vmpages.h 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,165 @@ ++/* ++ * include/ub/ub_vmpages.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_PAGES_H_ ++#define __UB_PAGES_H_ ++ ++#include <linux/linkage.h> ++#include <linux/config.h> ++#include <ub/beancounter.h> ++#include <ub/ub_decl.h> ++ ++/* ++ * Check whether vma has private or copy-on-write mapping. ++ * Should match checks in ub_protected_charge(). ++ */ ++#define VM_UB_PRIVATE(__flags, __file) \ ++ ( ((__flags) & VM_WRITE) ? \ ++ (__file) == NULL || !((__flags) & VM_SHARED) : \ ++ 0 \ ++ ) ++ ++/* Mprotect charging result */ ++#define PRIVVM_ERROR -1 ++#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */ ++#define PRIVVM_TO_PRIVATE 1 ++#define PRIVVM_TO_SHARED 2 ++ ++UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm, ++ unsigned long size, ++ unsigned long newflags, ++ struct vm_area_struct *vma)) ++ ++UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm, ++ struct vm_area_struct *vma, ++ unsigned long num)) ++#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1) ++UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm, ++ struct vm_area_struct *vma, ++ unsigned long num)) ++#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1) ++ ++UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm, ++ long sz)) ++ ++UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm, ++ unsigned long size, ++ unsigned vm_flags, ++ struct file *vm_file, ++ int strict)) ++UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm, ++ unsigned long size, ++ unsigned vm_flags, ++ struct file *vm_file)) ++ ++struct shmem_inode_info; ++UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i, long sz)) ++UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i, long sz)) ++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi)) ++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi, ++ unsigned long size)) ++#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1) ++ ++#ifdef CONFIG_USER_RESOURCE ++#define shmi_ub_set(shi, ub) do { \ ++ (shi)->shmi_ub = get_beancounter(ub); \ ++ } while (0) ++#define shmi_ub_put(shi) do { \ ++ put_beancounter((shi)->shmi_ub); \ ++ (shi)->shmi_ub = NULL; \ ++ } while (0) ++#else ++#define shmi_ub_set(shi, ub) do { } while (0) ++#define shmi_ub_put(shi) do { } while (0) ++#endif ++ ++UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm, ++ unsigned long size)) ++UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi, ++ unsigned long size)) ++ ++UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma, ++ unsigned long addr, unsigned long end)) ++UB_DECLARE_VOID_FUNC(warn_bad_rss(struct vm_area_struct *vma, ++ unsigned long freed)) ++#define pages_in_vma(vma) (pages_in_vma_range(vma, \ ++ vma->vm_start, vma->vm_end)) ++ ++#define UB_PAGE_WEIGHT_SHIFT 24 ++#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT) ++ ++struct page_beancounter; ++ ++/* Mprotect charging result */ ++#define PRIVVM_ERROR -1 ++#define PRIVVM_NO_CHARGE 0 ++#define PRIVVM_TO_PRIVATE 1 ++#define PRIVVM_TO_SHARED 2 ++ ++extern void fastcall __ub_update_physpages(struct user_beancounter *ub); ++extern void fastcall __ub_update_oomguarpages(struct user_beancounter *ub); ++extern void fastcall __ub_update_privvm(struct user_beancounter *ub); ++ ++#ifdef CONFIG_USER_RSS_ACCOUNTING ++#define 
PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) ++#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) ++#else ++#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} ++#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { } ++#endif ++ ++PB_DECLARE_FUNC(int, __pb_alloc(struct page_beancounter **pbc, gfp_t mask)) ++#define pb_alloc(pbp) (__pb_alloc(pbp, GFP_KERNEL)) ++PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num)) ++PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc)) ++PB_DECLARE_FUNC(int, pb_add_ref(struct page *page, ++ struct mm_struct *mm, ++ struct page_beancounter **pbc)) ++PB_DECLARE_VOID_FUNC(pb_add_list_ref(struct page *page, ++ struct mm_struct *mm, ++ struct page_beancounter **pbc)) ++PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb)) ++PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb)) ++PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page, ++ struct mm_struct *mm)) ++ ++PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page)) ++#endif ++ ++#ifdef CONFIG_USER_SWAP_ACCOUNTING ++#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) ++#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) ++#else ++#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} ++#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { } ++#endif ++ ++struct swap_info_struct; ++SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n)) ++SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si)) ++SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n, ++ struct user_beancounter *ub)) ++SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n)) ++ ++#ifdef CONFIG_USER_RESOURCE ++#define ub_unmap_inc(mm) do { \ ++ (mm)->mm_ub->ub_stat[smp_processor_id()].unmap++; \ ++ } while (0) ++#define ub_swapin_inc(mm) do { \ ++ (mm)->mm_ub->ub_stat[smp_processor_id()].swapin++; \ ++ } while (0) ++#else ++#define ub_unmap_inc(mm) do { } while (0) ++#define ub_swapin_inc(mm) do { } while (0) ++#endif +diff -uprN linux-2.6.15.orig/init/calibrate.c linux-2.6.15-ve025stab014/init/calibrate.c +--- linux-2.6.15.orig/init/calibrate.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/init/calibrate.c 2006-01-27 14:48:08.000000000 +0300 +@@ -7,6 +7,7 @@ + #include <linux/sched.h> + #include <linux/delay.h> + #include <linux/init.h> ++#include <linux/module.h> + + #include <asm/timex.h> + +@@ -105,6 +106,60 @@ static unsigned long __devinit calibrate + static unsigned long __devinit calibrate_delay_direct(void) {return 0;} + #endif + ++unsigned long cycles_per_jiffy, cycles_per_clock; ++ ++static __devinit void calibrate_cycles(void) ++{ ++ unsigned long ticks; ++ cycles_t time; ++ ++ ticks = jiffies; ++ while (ticks == jiffies) ++ /* nothing */; ++ time = get_cycles(); ++ ticks = jiffies; ++ while (ticks == jiffies) ++ /* nothing */; ++ ++ time = get_cycles() - time; ++ cycles_per_jiffy = time; ++ if ((time >> 32) != 0) { ++ printk("CPU too fast! 
timings are incorrect\n"); ++ cycles_per_jiffy = -1; ++ } ++} ++ ++EXPORT_SYMBOL(cycles_per_jiffy); ++EXPORT_SYMBOL(cycles_per_clock); ++ ++static __devinit void calc_cycles_per_jiffy(void) ++{ ++#if defined(__i386__) ++ extern unsigned long fast_gettimeoffset_quotient; ++ unsigned long low, high; ++ ++ if (fast_gettimeoffset_quotient != 0) { ++ __asm__("divl %2" ++ :"=a" (low), "=d" (high) ++ :"r" (fast_gettimeoffset_quotient), ++ "0" (0), "1" (1000000/HZ)); ++ ++ cycles_per_jiffy = low; ++ } ++#endif ++ if (cycles_per_jiffy == 0) ++ calibrate_cycles(); ++ ++ if (cycles_per_jiffy == 0) { ++ printk(KERN_WARNING "Cycles are stuck! " ++ "Some VPS statistics will not be available."); ++ /* to prevent division by zero in cycles_to_(clocks|jiffies) */ ++ cycles_per_jiffy = 1; ++ cycles_per_clock = 1; ++ } else ++ cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC); ++} ++ + /* + * This is the number of bits of precision for the loops_per_jiffy. Each + * bit takes on average 1.5/HZ seconds. This (like the original) is a little +@@ -170,4 +225,5 @@ void __devinit calibrate_delay(void) + loops_per_jiffy); + } + ++ calc_cycles_per_jiffy(); + } +diff -uprN linux-2.6.15.orig/init/main.c linux-2.6.15-ve025stab014/init/main.c +--- linux-2.6.15.orig/init/main.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/init/main.c 2006-01-27 14:48:08.000000000 +0300 +@@ -49,6 +49,8 @@ + #include <linux/key.h> + #include <net/sock.h> + ++#include <ub/beancounter.h> ++ + #include <asm/io.h> + #include <asm/bugs.h> + #include <asm/setup.h> +@@ -108,6 +110,20 @@ extern void tc_init(void); + enum system_states system_state; + EXPORT_SYMBOL(system_state); + ++#ifdef CONFIG_VE ++extern void init_ve_system(void); ++extern void prepare_ve0_process(struct task_struct *tsk); ++extern void prepare_ve0_proc_root(void); ++extern void prepare_ve0_sysctl(void); ++extern void prepare_ve0_loopback(void); ++#else ++#define init_ve_system() do { } while (0) ++#define prepare_ve0_process(tsk) do { } while (0) ++#define prepare_ve0_proc_root() do { } while (0) ++#define prepare_ve0_sysctl() do { } while (0) ++#define prepare_ve0_loopback() do { } while (0) ++#endif ++ + /* + * Boot command-line arguments + */ +@@ -451,6 +467,10 @@ asmlinkage void __init start_kernel(void + * enable them + */ + lock_kernel(); ++ /* ++ * Prepare ub0 to account early allocations if any ++ */ ++ ub_init_ub0(); + page_address_init(); + printk(KERN_NOTICE); + printk(linux_banner); +@@ -463,6 +483,8 @@ asmlinkage void __init start_kernel(void + */ + smp_prepare_boot_cpu(); + ++ prepare_ve0_process(&init_task); ++ + /* + * Set up the scheduler prior starting any interrupts (such as the + * timer interrupt). 
Full topology setup happens at smp_init() +@@ -526,6 +548,7 @@ asmlinkage void __init start_kernel(void + #endif + fork_init(num_physpages); + proc_caches_init(); ++ ub_init_cache(num_physpages); + buffer_init(); + unnamed_dev_init(); + key_init(); +@@ -536,7 +559,10 @@ asmlinkage void __init start_kernel(void + /* rootfs populating might need page-writeback */ + page_writeback_init(); + #ifdef CONFIG_PROC_FS ++ prepare_ve0_proc_root(); ++ prepare_ve0_sysctl(); + proc_root_init(); ++ ub_init_proc(); + #endif + cpuset_init(); + +@@ -544,6 +570,10 @@ asmlinkage void __init start_kernel(void + + acpi_early_init(); /* before LAPIC and SMP init */ + ++#ifdef CONFIG_USER_RESOURCE ++ ub_init_pbc(); ++#endif ++ + /* Do the rest non-__init'ed, we're now alive */ + rest_init(); + } +@@ -605,6 +635,9 @@ static void __init do_initcalls(void) + */ + static void __init do_basic_setup(void) + { ++ prepare_ve0_loopback(); ++ init_ve_system(); ++ + /* drivers will send hotplug events */ + init_workqueues(); + usermodehelper_init(); +diff -uprN linux-2.6.15.orig/ipc/mqueue.c linux-2.6.15-ve025stab014/ipc/mqueue.c +--- linux-2.6.15.orig/ipc/mqueue.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/ipc/mqueue.c 2006-01-27 14:48:05.000000000 +0300 +@@ -1010,7 +1010,8 @@ retry: + goto out; + } + +- ret = netlink_attachskb(sock, nc, 0, MAX_SCHEDULE_TIMEOUT); ++ ret = netlink_attachskb(sock, nc, 0, ++ MAX_SCHEDULE_TIMEOUT, NULL); + if (ret == 1) + goto retry; + if (ret) { +diff -uprN linux-2.6.15.orig/ipc/msg.c linux-2.6.15-ve025stab014/ipc/msg.c +--- linux-2.6.15.orig/ipc/msg.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/ipc/msg.c 2006-01-27 14:48:08.000000000 +0300 +@@ -87,6 +87,45 @@ void __init msg_init (void) + sysvipc_msg_proc_show); + } + ++#ifdef CONFIG_VE ++void __init prepare_msg(void) ++{ ++ get_ve0()->_msg_ids = &msg_ids; ++ get_ve0()->_msg_ctlmax = msg_ctlmax; ++ get_ve0()->_msg_ctlmnb = msg_ctlmnb; ++ get_ve0()->_msg_ctlmni = msg_ctlmni; ++} ++ ++#define msg_ids (*(get_exec_env()->_msg_ids)) ++#define msg_ctlmax (get_exec_env()->_msg_ctlmax) ++#define msg_ctlmnb (get_exec_env()->_msg_ctlmnb) ++#define msg_ctlmni (get_exec_env()->_msg_ctlmni) ++ ++void init_ve_ipc_msg(void) ++{ ++ msg_ctlmax = MSGMAX; ++ msg_ctlmnb = MSGMNB; ++ msg_ctlmni = MSGMNI; ++ ipc_init_ids(&msg_ids, MSGMNI); ++} ++ ++void cleanup_ve_ipc_msg(void) ++{ ++ int i; ++ struct msg_queue *msq; ++ ++ down(&msg_ids.sem); ++ for (i = 0; i <= msg_ids.max_id; i++) { ++ msq = msg_lock(i); ++ if (msq == NULL) ++ continue; ++ ++ freeque(msq, i); ++ } ++ up(&msg_ids.sem); ++} ++#endif ++ + static int newque (key_t key, int msgflg) + { + int id; +@@ -449,7 +488,7 @@ asmlinkage long sys_msgctl (int msqid, i + ipcp = &msq->q_perm; + err = -EPERM; + if (current->euid != ipcp->cuid && +- current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) ++ current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) + /* We _could_ check for CAP_CHOWN above, but we don't */ + goto out_unlock_up; + +@@ -539,7 +578,7 @@ static inline int pipelined_send(struct + msr->r_msg = ERR_PTR(-E2BIG); + } else { + msr->r_msg = NULL; +- msq->q_lrpid = msr->r_tsk->pid; ++ msq->q_lrpid = virt_pid(msr->r_tsk); + msq->q_rtime = get_seconds(); + wake_up_process(msr->r_tsk); + smp_mb(); +@@ -621,7 +660,7 @@ asmlinkage long sys_msgsnd (int msqid, s + } + } + +- msq->q_lspid = current->tgid; ++ msq->q_lspid = virt_tgid(current); + msq->q_stime = get_seconds(); + + if(!pipelined_send(msq,msg)) { +@@ -717,7 +756,7 @@ asmlinkage long sys_msgrcv 
(int msqid, s + list_del(&msg->m_list); + msq->q_qnum--; + msq->q_rtime = get_seconds(); +- msq->q_lrpid = current->tgid; ++ msq->q_lrpid = virt_tgid(current); + msq->q_cbytes -= msg->m_ts; + atomic_sub(msg->m_ts,&msg_bytes); + atomic_dec(&msg_hdrs); +diff -uprN linux-2.6.15.orig/ipc/msgutil.c linux-2.6.15-ve025stab014/ipc/msgutil.c +--- linux-2.6.15.orig/ipc/msgutil.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/ipc/msgutil.c 2006-01-27 14:48:06.000000000 +0300 +@@ -17,6 +17,8 @@ + + #include "util.h" + ++#include <ub/ub_mem.h> ++ + struct msg_msgseg { + struct msg_msgseg* next; + /* the next part of the message follows immediately */ +@@ -36,7 +38,7 @@ struct msg_msg *load_msg(const void __us + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + +- msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL); ++ msg = (struct msg_msg *)ub_kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + if (msg == NULL) + return ERR_PTR(-ENOMEM); + +@@ -56,7 +58,7 @@ struct msg_msg *load_msg(const void __us + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; +- seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen, ++ seg = (struct msg_msgseg *)ub_kmalloc(sizeof(*seg) + alen, + GFP_KERNEL); + if (seg == NULL) { + err = -ENOMEM; +diff -uprN linux-2.6.15.orig/ipc/sem.c linux-2.6.15-ve025stab014/ipc/sem.c +--- linux-2.6.15.orig/ipc/sem.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/ipc/sem.c 2006-01-27 14:48:08.000000000 +0300 +@@ -77,6 +77,7 @@ + #include <asm/uaccess.h> + #include "util.h" + ++#include <ub/ub_mem.h> + + #define sem_lock(id) ((struct sem_array*)ipc_lock(&sem_ids,id)) + #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) +@@ -123,6 +124,48 @@ void __init sem_init (void) + sysvipc_sem_proc_show); + } + ++#ifdef CONFIG_VE ++void __init prepare_sem(void) ++{ ++ get_ve0()->_sem_ids = &sem_ids; ++ get_ve0()->_used_sems = used_sems; ++ get_ve0()->_sem_ctls[0] = sem_ctls[0]; ++ get_ve0()->_sem_ctls[1] = sem_ctls[1]; ++ get_ve0()->_sem_ctls[2] = sem_ctls[2]; ++ get_ve0()->_sem_ctls[3] = sem_ctls[3]; ++} ++ ++#define sem_ids (*(get_exec_env()->_sem_ids)) ++#define used_sems (get_exec_env()->_used_sems) ++#define sem_ctls (get_exec_env()->_sem_ctls) ++ ++void init_ve_ipc_sem(void) ++{ ++ used_sems = 0; ++ sem_ctls[0] = SEMMSL; ++ sem_ctls[1] = SEMMNS; ++ sem_ctls[2] = SEMOPM; ++ sem_ctls[3] = SEMMNI; ++ ipc_init_ids(&sem_ids, SEMMNI); ++} ++ ++void cleanup_ve_ipc_sem(void) ++{ ++ int i; ++ struct sem_array *sma; ++ ++ down(&sem_ids.sem); ++ for (i = 0; i <= sem_ids.max_id; i++) { ++ sma = sem_lock(i); ++ if (sma == NULL) ++ continue; ++ ++ freeary(sma, i); ++ } ++ up(&sem_ids.sem); ++} ++#endif ++ + /* + * Lockless wakeup algorithm: + * Without the check/retry algorithm a lockless wakeup is possible: +@@ -742,7 +785,7 @@ static int semctl_main(int semid, int se + for (un = sma->undo; un; un = un->id_next) + un->semadj[semnum] = 0; + curr->semval = val; +- curr->sempid = current->tgid; ++ curr->sempid = virt_tgid(current); + sma->sem_ctime = get_seconds(); + /* maybe some queued-up processes were waiting for this */ + update_queue(sma); +@@ -822,7 +865,7 @@ static int semctl_down(int semid, int se + ipcp = &sma->sem_perm; + + if (current->euid != ipcp->cuid && +- current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) { ++ current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) { + err=-EPERM; + goto out_unlock; + } +@@ -943,7 +986,8 @@ static inline int get_undo_list(struct s + undo_list = current->sysvsem.undo_list; + if (!undo_list) { + size = 
sizeof(struct sem_undo_list); +- undo_list = (struct sem_undo_list *) kmalloc(size, GFP_KERNEL); ++ undo_list = (struct sem_undo_list *) ub_kmalloc(size, ++ GFP_KERNEL); + if (undo_list == NULL) + return -ENOMEM; + memset(undo_list, 0, size); +@@ -1007,7 +1051,8 @@ static struct sem_undo *find_undo(int se + ipc_rcu_getref(sma); + sem_unlock(sma); + +- new = (struct sem_undo *) kmalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); ++ new = (struct sem_undo *) ub_kmalloc(sizeof(struct sem_undo) + ++ sizeof(short)*nsems, GFP_KERNEL); + if (!new) { + ipc_lock_by_ptr(&sma->sem_perm); + ipc_rcu_putref(sma); +@@ -1065,7 +1110,7 @@ asmlinkage long sys_semtimedop(int semid + if (nsops > sc_semopm) + return -E2BIG; + if(nsops > SEMOPM_FAST) { +- sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); ++ sops = ub_kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); + if(sops==NULL) + return -ENOMEM; + } +@@ -1149,7 +1194,7 @@ retry_undos: + queue.sops = sops; + queue.nsops = nsops; + queue.undo = un; +- queue.pid = current->tgid; ++ queue.pid = virt_tgid(current); + queue.id = semid; + queue.alter = alter; + if (alter) +@@ -1319,7 +1364,7 @@ found: + sem->semval = 0; + if (sem->semval > SEMVMX) + sem->semval = SEMVMX; +- sem->sempid = current->tgid; ++ sem->sempid = virt_tgid(current); + } + } + sma->sem_otime = get_seconds(); +diff -uprN linux-2.6.15.orig/ipc/shm.c linux-2.6.15-ve025stab014/ipc/shm.c +--- linux-2.6.15.orig/ipc/shm.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/ipc/shm.c 2006-01-27 14:48:08.000000000 +0300 +@@ -29,9 +29,13 @@ + #include <linux/audit.h> + #include <linux/ptrace.h> + #include <linux/seq_file.h> ++#include <linux/shmem_fs.h> + + #include <asm/uaccess.h> + ++#include <ub/beancounter.h> ++#include <ub/ub_vmpages.h> ++ + #include "util.h" + + #define shm_flags shm_perm.mode +@@ -50,6 +54,7 @@ static struct ipc_ids shm_ids; + static int newseg (key_t key, int shmflg, size_t size); + static void shm_open (struct vm_area_struct *shmd); + static void shm_close (struct vm_area_struct *shmd); ++static void shm_destroy (struct shmid_kernel *shmd); + #ifdef CONFIG_PROC_FS + static int sysvipc_shm_proc_show(struct seq_file *s, void *it); + #endif +@@ -69,6 +74,62 @@ void __init shm_init (void) + sysvipc_shm_proc_show); + } + ++#ifdef CONFIG_VE ++void __init prepare_shm(void) ++{ ++ get_ve0()->_shm_ids = &shm_ids; ++ get_ve0()->_shm_ctlmax = shm_ctlmax; ++ get_ve0()->_shm_ctlall = shm_ctlall; ++ get_ve0()->_shm_ctlmni = shm_ctlmni; ++ get_ve0()->_shm_tot = shm_tot; ++} ++ ++#define shm_ids (*(get_exec_env()->_shm_ids)) ++#define shm_ctlmax (get_exec_env()->_shm_ctlmax) ++#define shm_ctlall (get_exec_env()->_shm_ctlall) ++#define shm_ctlmni (get_exec_env()->_shm_ctlmni) ++#define shm_total (get_exec_env()->_shm_tot) ++ ++void init_ve_ipc_shm(void) ++{ ++ shm_ctlmax = SHMMAX; ++ shm_ctlall = SHMALL; ++ shm_ctlmni = SHMMNI; ++ shm_total = 0; ++ ipc_init_ids(&shm_ids, 1); ++} ++ ++void cleanup_ve_ipc_shm(void) ++{ ++ int i; ++ struct shmid_kernel *shp; ++ ++ down(&shm_ids.sem); ++ for (i = 0; i <= shm_ids.max_id; i++) { ++ shp = shm_lock(i); ++ if (shp == NULL) ++ continue; ++ ++ if (shp->shm_nattch) { ++ shp->shm_flags |= SHM_DEST; ++ shp->shm_perm.key = IPC_PRIVATE; ++ shm_unlock(shp); ++ } else ++ shm_destroy(shp); ++ } ++ up(&shm_ids.sem); ++} ++#define sb_ve(sb) VE_OWNER_FSTYPE(sb->s_type) ++#define shm_total_sb(sb) (&sb_ve(sb)->_shm_tot) ++#define shm_lock_sb(id, sb) ((struct shmid_kernel *) \ ++ ipc_lock(sb_ve(sb)->_shm_ids, id)) ++#else ++/* renamed since there 
is a struct field named shm_tot */ ++#define shm_total shm_tot ++#define shm_total_sb(sb) (&shm_tot) ++#define shm_lock_sb(id, sb) shm_lock(id) ++#endif ++ + static inline int shm_checkid(struct shmid_kernel *s, int id) + { + if (ipc_checkid(&shm_ids,&s->shm_perm,id)) +@@ -76,9 +137,9 @@ static inline int shm_checkid(struct shm + return 0; + } + +-static inline struct shmid_kernel *shm_rmid(int id) ++static inline struct shmid_kernel *shm_rmid(struct ipc_ids *ids, int id) + { +- return (struct shmid_kernel *)ipc_rmid(&shm_ids,id); ++ return (struct shmid_kernel *)ipc_rmid(ids,id); + } + + static inline int shm_addid(struct shmid_kernel *shp) +@@ -88,13 +149,13 @@ static inline int shm_addid(struct shmid + + + +-static inline void shm_inc (int id) { ++static inline void shm_inc(int id, struct super_block *sb) { + struct shmid_kernel *shp; + +- if(!(shp = shm_lock(id))) ++ if(!(shp = shm_lock_sb(id, sb))) + BUG(); + shp->shm_atim = get_seconds(); +- shp->shm_lprid = current->tgid; ++ shp->shm_lprid = virt_tgid(current); + shp->shm_nattch++; + shm_unlock(shp); + } +@@ -102,7 +163,50 @@ static inline void shm_inc (int id) { + /* This is called by fork, once for every shm attach. */ + static void shm_open (struct vm_area_struct *shmd) + { +- shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino); ++ shm_inc(shmd->vm_file->f_dentry->d_inode->i_ino, ++ shmd->vm_file->f_dentry->d_inode->i_sb); ++} ++ ++static int shmem_lock(struct shmid_kernel *shp, int lock, ++ struct user_struct *user) ++{ ++ struct file *file = shp->shm_file; ++ struct inode *inode = file->f_dentry->d_inode; ++ struct shmem_inode_info *info = SHMEM_I(inode); ++ unsigned long size; ++ ++ size = shp->shm_segsz + PAGE_SIZE - 1; ++ ++#ifdef CONFIG_SHMEM ++ spin_lock(&info->lock); ++ if (lock && !(info->flags & VM_LOCKED)) { ++ if (ub_lockedshm_charge(info, size) < 0) ++ goto out_ch; ++ ++ if (!user_shm_lock(inode->i_size, user)) ++ goto out_user; ++ info->flags |= VM_LOCKED; ++ } ++ if (!lock && (info->flags & VM_LOCKED) && user) { ++ ub_lockedshm_uncharge(info, size); ++ user_shm_unlock(inode->i_size, user); ++ info->flags &= ~VM_LOCKED; ++ } ++ spin_unlock(&info->lock); ++ return 0; ++ ++out_user: ++ ub_lockedshm_uncharge(info, size); ++out_ch: ++ spin_unlock(&info->lock); ++ return -ENOMEM; ++#else ++ if (lock && ub_lockedshm_charge(info, size)) ++ return -ENOMEM; ++ if (!lock) ++ ub_lockedshm_uncharge(info, size); ++ return 0; ++#endif + } + + /* +@@ -115,15 +219,24 @@ static void shm_open (struct vm_area_str + */ + static void shm_destroy (struct shmid_kernel *shp) + { +- shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; +- shm_rmid (shp->id); ++ int numpages, *shm_totalp; ++ struct file *f; ++ struct super_block *sb; ++ ++ f = shp->shm_file; ++ sb = f->f_dentry->d_inode->i_sb; ++ numpages = (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ shm_totalp = shm_total_sb(sb); ++ *shm_totalp -= numpages; ++ ++ shm_rmid (shp->_shm_ids, shp->id); + shm_unlock(shp); + if (!is_file_hugepages(shp->shm_file)) +- shmem_lock(shp->shm_file, 0, shp->mlock_user); ++ shmem_lock(shp, 0, shp->mlock_user); + else + user_shm_unlock(shp->shm_file->f_dentry->d_inode->i_size, + shp->mlock_user); +- fput (shp->shm_file); ++ fput(f); + security_shm_free(shp); + ipc_rcu_putref(shp); + } +@@ -139,12 +252,24 @@ static void shm_close (struct vm_area_st + struct file * file = shmd->vm_file; + int id = file->f_dentry->d_inode->i_ino; + struct shmid_kernel *shp; ++ struct super_block *sb; ++ struct ipc_ids *ids; ++#ifdef CONFIG_VE ++ struct ve_struct *ve; 
++ ++ sb = file->f_dentry->d_inode->i_sb; ++ ve = get_ve(sb_ve(sb)); ++ ids = ve->_shm_ids; ++#else ++ sb = file->f_dentry->d_inode->i_sb; ++ ids = &shm_ids; ++#endif + +- down (&shm_ids.sem); ++ down (&ids->sem); + /* remove from the list of attaches of the shm segment */ +- if(!(shp = shm_lock(id))) ++ if(!(shp = shm_lock_sb(id, sb))) + BUG(); +- shp->shm_lprid = current->tgid; ++ shp->shm_lprid = virt_tgid(current); + shp->shm_dtim = get_seconds(); + shp->shm_nattch--; + if(shp->shm_nattch == 0 && +@@ -152,14 +277,18 @@ static void shm_close (struct vm_area_st + shm_destroy (shp); + else + shm_unlock(shp); +- up (&shm_ids.sem); ++ up(&ids->sem); ++#ifdef CONFIG_VE ++ put_ve(ve); ++#endif + } + + static int shm_mmap(struct file * file, struct vm_area_struct * vma) + { + file_accessed(file); + vma->vm_ops = &shm_vm_ops; +- shm_inc(file->f_dentry->d_inode->i_ino); ++ shm_inc(file->f_dentry->d_inode->i_ino, ++ file->f_dentry->d_inode->i_sb); + return 0; + } + +@@ -183,13 +312,13 @@ static int newseg (key_t key, int shmflg + struct shmid_kernel *shp; + int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; + struct file * file; +- char name[13]; ++ char name[26]; + int id; + + if (size < SHMMIN || size > shm_ctlmax) + return -EINVAL; + +- if (shm_tot + numpages >= shm_ctlall) ++ if (shm_total + numpages >= shm_ctlall) + return -ENOSPC; + + shp = ipc_rcu_alloc(sizeof(*shp)); +@@ -220,7 +349,11 @@ static int newseg (key_t key, int shmflg + if ((shmflg & SHM_NORESERVE) && + sysctl_overcommit_memory != OVERCOMMIT_NEVER) + acctflag = 0; ++#ifdef CONFIG_VE ++ sprintf (name, "VE%d.SYSV%08x", get_exec_env()->veid, key); ++#else + sprintf (name, "SYSV%08x", key); ++#endif + file = shmem_file_setup(name, size, acctflag); + } + error = PTR_ERR(file); +@@ -232,13 +365,14 @@ static int newseg (key_t key, int shmflg + if(id == -1) + goto no_id; + +- shp->shm_cprid = current->tgid; ++ shp->shm_cprid = virt_tgid(current); + shp->shm_lprid = 0; + shp->shm_atim = shp->shm_dtim = 0; + shp->shm_ctim = get_seconds(); + shp->shm_segsz = size; + shp->shm_nattch = 0; + shp->id = shm_buildid(id,shp->shm_perm.seq); ++ shp->_shm_ids = &shm_ids; + shp->shm_file = file; + file->f_dentry->d_inode->i_ino = shp->id; + +@@ -246,7 +380,7 @@ static int newseg (key_t key, int shmflg + if (!(shmflg & SHM_HUGETLB)) + file->f_op = &shm_file_operations; + +- shm_tot += numpages; ++ shm_total += numpages; + shm_unlock(shp); + return shp->id; + +@@ -463,7 +597,7 @@ asmlinkage long sys_shmctl (int shmid, i + down(&shm_ids.sem); + shm_info.used_ids = shm_ids.in_use; + shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp); +- shm_info.shm_tot = shm_tot; ++ shm_info.shm_tot = shm_total; + shm_info.swap_attempts = 0; + shm_info.swap_successes = 0; + err = shm_ids.max_id; +@@ -550,14 +684,14 @@ asmlinkage long sys_shmctl (int shmid, i + if(cmd==SHM_LOCK) { + struct user_struct * user = current->user; + if (!is_file_hugepages(shp->shm_file)) { +- err = shmem_lock(shp->shm_file, 1, user); ++ err = shmem_lock(shp, 1, user); + if (!err) { + shp->shm_flags |= SHM_LOCKED; + shp->mlock_user = user; + } + } + } else if (!is_file_hugepages(shp->shm_file)) { +- shmem_lock(shp->shm_file, 0, shp->mlock_user); ++ shmem_lock(shp, 0, shp->mlock_user); + shp->shm_flags &= ~SHM_LOCKED; + shp->mlock_user = NULL; + } +@@ -587,7 +721,7 @@ asmlinkage long sys_shmctl (int shmid, i + + if (current->euid != shp->shm_perm.uid && + current->euid != shp->shm_perm.cuid && +- !capable(CAP_SYS_ADMIN)) { ++ !capable(CAP_VE_SYS_ADMIN)) { + err=-EPERM; + goto out_unlock_up; + 
} +@@ -626,7 +760,7 @@ asmlinkage long sys_shmctl (int shmid, i + err=-EPERM; + if (current->euid != shp->shm_perm.uid && + current->euid != shp->shm_perm.cuid && +- !capable(CAP_SYS_ADMIN)) { ++ !capable(CAP_VE_SYS_ADMIN)) { + goto out_unlock_up; + } + +diff -uprN linux-2.6.15.orig/ipc/util.c linux-2.6.15-ve025stab014/ipc/util.c +--- linux-2.6.15.orig/ipc/util.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/ipc/util.c 2006-01-27 14:48:08.000000000 +0300 +@@ -13,6 +13,7 @@ + */ + + #include <linux/config.h> ++#include <linux/module.h> + #include <linux/mm.h> + #include <linux/shm.h> + #include <linux/init.h> +@@ -29,6 +30,8 @@ + + #include <asm/unistd.h> + ++#include <ub/ub_mem.h> ++ + #include "util.h" + + struct ipc_proc_iface { +@@ -64,7 +67,7 @@ __initcall(ipc_init); + * array itself. + */ + +-void __init ipc_init_ids(struct ipc_ids* ids, int size) ++void __ve_init ipc_init_ids(struct ipc_ids* ids, int size) + { + int i; + sema_init(&ids->sem,1); +@@ -93,6 +96,8 @@ void __init ipc_init_ids(struct ipc_ids* + ids->entries->size = size; + for(i=0;i<size;i++) + ids->entries->p[i] = NULL; ++ ++ ids->owner_env = get_exec_env(); + } + + #ifdef CONFIG_PROC_FS +@@ -228,7 +233,8 @@ int ipc_addid(struct ipc_ids* ids, struc + } + return -1; + found: +- ids->in_use++; ++ if (ids->in_use++ == 0) ++ (void)get_ve(ids->owner_env); + if (id > ids->max_id) + ids->max_id = id; + +@@ -275,7 +281,8 @@ struct kern_ipc_perm* ipc_rmid(struct ip + ids->entries->p[lid] = NULL; + if(p==NULL) + BUG(); +- ids->in_use--; ++ if (--ids->in_use == 0) ++ put_ve(ids->owner_env); + + if (lid == ids->max_id) { + do { +@@ -301,9 +308,9 @@ void* ipc_alloc(int size) + { + void* out; + if(size > PAGE_SIZE) +- out = vmalloc(size); ++ out = ub_vmalloc(size); + else +- out = kmalloc(size, GFP_KERNEL); ++ out = ub_kmalloc(size, GFP_KERNEL); + return out; + } + +@@ -386,14 +393,14 @@ void* ipc_rcu_alloc(int size) + * workqueue if necessary (for vmalloc). 
+ */ + if (rcu_use_vmalloc(size)) { +- out = vmalloc(HDRLEN_VMALLOC + size); ++ out = ub_vmalloc(HDRLEN_VMALLOC + size); + if (out) { + out += HDRLEN_VMALLOC; + container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; + container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; + } + } else { +- out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); ++ out = ub_kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); + if (out) { + out += HDRLEN_KMALLOC; + container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; +@@ -602,6 +609,80 @@ int ipc_checkid(struct ipc_ids* ids, str + return 0; + } + ++#ifdef CONFIG_VE ++void __init prepare_ipc(void) ++{ ++ prepare_msg(); ++ prepare_sem(); ++ prepare_shm(); ++} ++ ++int init_ve_ipc(struct ve_struct * envid) ++{ ++ envid->_msg_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *), ++ GFP_KERNEL); ++ if (envid->_msg_ids == NULL) ++ goto out_nomem; ++ envid->_sem_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *), ++ GFP_KERNEL); ++ if (envid->_sem_ids == NULL) ++ goto out_free_msg; ++ envid->_shm_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *), ++ GFP_KERNEL); ++ if (envid->_shm_ids == NULL) ++ goto out_free_sem; ++ ++ init_ve_ipc_msg(); ++ init_ve_ipc_sem(); ++ init_ve_ipc_shm(); ++ return 0; ++ ++out_free_sem: ++ kfree(envid->_sem_ids); ++out_free_msg: ++ kfree(envid->_msg_ids); ++out_nomem: ++ return -ENOMEM; ++} ++ ++void ve_ipc_cleanup(void) ++{ ++ cleanup_ve_ipc_msg(); ++ cleanup_ve_ipc_sem(); ++ cleanup_ve_ipc_shm(); ++} ++ ++void ve_ipc_free(struct ve_struct *envid) ++{ ++ if (envid->_msg_ids) { ++ ipc_rcu_putref(envid->_msg_ids->entries); ++ kfree(envid->_msg_ids); ++ envid->_msg_ids = NULL; ++ } ++ if (envid->_sem_ids) { ++ ipc_rcu_putref(envid->_sem_ids->entries); ++ kfree(envid->_sem_ids); ++ envid->_sem_ids = NULL; ++ } ++ if (envid->_shm_ids) { ++ ipc_rcu_putref(envid->_shm_ids->entries); ++ kfree(envid->_shm_ids); ++ envid->_shm_ids = NULL; ++ } ++} ++ ++void fini_ve_ipc(struct ve_struct *ptr) ++{ ++ ve_ipc_cleanup(); ++ ve_ipc_free(ptr); ++} ++ ++EXPORT_SYMBOL(init_ve_ipc); ++EXPORT_SYMBOL(ve_ipc_cleanup); ++EXPORT_SYMBOL(ve_ipc_free); ++EXPORT_SYMBOL(fini_ve_ipc); ++#endif /* CONFIG_VE */ ++ + #ifdef __ARCH_WANT_IPC_PARSE_VERSION + + +diff -uprN linux-2.6.15.orig/ipc/util.h linux-2.6.15-ve025stab014/ipc/util.h +--- linux-2.6.15.orig/ipc/util.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/ipc/util.h 2006-01-27 14:48:08.000000000 +0300 +@@ -15,6 +15,22 @@ void sem_init (void); + void msg_init (void); + void shm_init (void); + ++#ifdef CONFIG_VE ++void prepare_msg(void); ++void prepare_sem(void); ++void prepare_shm(void); ++void init_ve_ipc_msg(void); ++void init_ve_ipc_sem(void); ++void init_ve_ipc_shm(void); ++void cleanup_ve_ipc_msg(void); ++void cleanup_ve_ipc_sem(void); ++void cleanup_ve_ipc_shm(void); ++ ++#define __ve_init ++#else ++#define __ve_init __init ++#endif ++ + struct ipc_id_ary { + int size; + struct kern_ipc_perm *p[0]; +@@ -28,10 +44,11 @@ struct ipc_ids { + struct semaphore sem; + struct ipc_id_ary nullentry; + struct ipc_id_ary* entries; ++ struct ve_struct *owner_env; + }; + + struct seq_file; +-void __init ipc_init_ids(struct ipc_ids* ids, int size); ++void __ve_init ipc_init_ids(struct ipc_ids *ids, int size); + #ifdef CONFIG_PROC_FS + void __init ipc_init_proc_interface(const char *path, const char *header, + struct ipc_ids *ids, +diff -uprN linux-2.6.15.orig/kernel/Kconfig.openvz linux-2.6.15-ve025stab014/kernel/Kconfig.openvz +--- linux-2.6.15.orig/kernel/Kconfig.openvz 1970-01-01 
03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/Kconfig.openvz 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,48 @@ ++# Copyright (C) 2005 SWsoft ++# All rights reserved. ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++config VE ++ bool "Virtual Environment support" ++ default y ++ help ++ This option adds support of virtual Linux running on the original box ++ with fully supported virtual network driver, tty subsystem and ++ configurable access for hardware and other resources. ++ ++config VE_CALLS ++ tristate "VE calls interface" ++ depends on VE ++ default m ++ help ++ This option controls how to build vzmon code containing VE calls. ++ By default it's build in module vzmon.o ++ ++config VE_SYSFS ++ bool "Enable sysfs support in Virtual Environments" ++ depends on VE ++ default y ++ help ++ This option enables sysfs support in Virtual Environments ++ ++config VE_NETDEV ++ tristate "VE networking" ++ depends on VE ++ default m ++ help ++ This option controls whether to build VE networking code. ++ ++config VE_IPTABLES ++ bool "VE netfiltering" ++ depends on VE && VE_NETDEV && INET && NETFILTER ++ default y ++ help ++ This option controls whether to build VE netfiltering code. ++ ++config VZ_WDOG ++ tristate "VE watchdog module" ++ depends on VE ++ default m ++ help ++ This option controls building of vzwdog module, which dumps ++ a lot of useful system info on console periodically. +diff -uprN linux-2.6.15.orig/kernel/Makefile linux-2.6.15-ve025stab014/kernel/Makefile +--- linux-2.6.15.orig/kernel/Makefile 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/Makefile 2006-01-27 14:48:09.000000000 +0300 +@@ -9,6 +9,15 @@ obj-y = sched.o fork.o exec_domain.o + rcupdate.o intermodule.o extable.o params.o posix-timers.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o + ++obj-y += ub/ ++ ++obj-$(CONFIG_VE) += ve.o ++obj-$(CONFIG_VE) += veowner.o ++obj-$(CONFIG_VE_CALLS) += vzdev.o ++obj-$(CONFIG_VZ_WDOG) += vzwdog.o ++obj-$(CONFIG_VE_CALLS) += vzmon.o ++vzmon-objs = vecalls.o ++ + obj-$(CONFIG_FUTEX) += futex.o + obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o + obj-$(CONFIG_SMP) += cpu.o spinlock.o +@@ -33,6 +42,8 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o + obj-$(CONFIG_SECCOMP) += seccomp.o + obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o + ++obj-$(CONFIG_VE_CALLS) += vzcompat.o ++ + ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) + # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is + # needed for x86 only. Why this used to be enabled for all architectures is beyond +diff -uprN linux-2.6.15.orig/kernel/capability.c linux-2.6.15-ve025stab014/kernel/capability.c +--- linux-2.6.15.orig/kernel/capability.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/capability.c 2006-01-27 14:48:08.000000000 +0300 +@@ -24,6 +24,7 @@ EXPORT_SYMBOL(cap_bset); + * Locking rule: acquire this prior to tasklist_lock. 
+ */ + static DEFINE_SPINLOCK(task_capability_lock); ++EXPORT_SYMBOL(task_capability_lock); + + /* + * For sys_getproccap() and sys_setproccap(), any of the three +@@ -66,8 +67,8 @@ asmlinkage long sys_capget(cap_user_head + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + +- if (pid && pid != current->pid) { +- target = find_task_by_pid(pid); ++ if (pid && pid != virt_pid(current)) { ++ target = find_task_by_pid_ve(pid); + if (!target) { + ret = -ESRCH; + goto out; +@@ -99,9 +100,13 @@ static inline int cap_set_pg(int pgrp, k + int ret = -EPERM; + int found = 0; + +- do_each_task_pid(pgrp, PIDTYPE_PGID, g) { ++ pgrp = vpid_to_pid(pgrp); ++ if (pgrp < 0) ++ return ret; ++ ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, g) { + target = g; +- while_each_thread(g, target) { ++ while_each_thread_ve(g, target) { + if (!security_capset_check(target, effective, + inheritable, + permitted)) { +@@ -112,7 +117,7 @@ static inline int cap_set_pg(int pgrp, k + } + found = 1; + } +- } while_each_task_pid(pgrp, PIDTYPE_PGID, g); ++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, g); + + if (!found) + ret = 0; +@@ -131,7 +136,7 @@ static inline int cap_set_all(kernel_cap + int ret = -EPERM; + int found = 0; + +- do_each_thread(g, target) { ++ do_each_thread_ve(g, target) { + if (target == current || target->pid == 1) + continue; + found = 1; +@@ -140,7 +145,7 @@ static inline int cap_set_all(kernel_cap + continue; + ret = 0; + security_capset_set(target, effective, inheritable, permitted); +- } while_each_thread(g, target); ++ } while_each_thread_ve(g, target); + + if (!found) + ret = 0; +@@ -187,7 +192,7 @@ asmlinkage long sys_capset(cap_user_head + if (get_user(pid, &header->pid)) + return -EFAULT; + +- if (pid && pid != current->pid && !capable(CAP_SETPCAP)) ++ if (pid && pid != virt_pid(current) && !capable(CAP_SETPCAP)) + return -EPERM; + + if (copy_from_user(&effective, &data->effective, sizeof(effective)) || +@@ -198,8 +203,8 @@ asmlinkage long sys_capset(cap_user_head + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + +- if (pid > 0 && pid != current->pid) { +- target = find_task_by_pid(pid); ++ if (pid > 0 && pid != virt_pid(current)) { ++ target = find_task_by_pid_ve(pid); + if (!target) { + ret = -ESRCH; + goto out; +diff -uprN linux-2.6.15.orig/kernel/configs.c linux-2.6.15-ve025stab014/kernel/configs.c +--- linux-2.6.15.orig/kernel/configs.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/configs.c 2006-01-27 14:48:08.000000000 +0300 +@@ -89,8 +89,7 @@ static int __init ikconfig_init(void) + struct proc_dir_entry *entry; + + /* create the current config file */ +- entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, +- &proc_root); ++ entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, NULL); + if (!entry) + return -ENOMEM; + +diff -uprN linux-2.6.15.orig/kernel/cpu.c linux-2.6.15-ve025stab014/kernel/cpu.c +--- linux-2.6.15.orig/kernel/cpu.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/cpu.c 2006-01-27 14:48:08.000000000 +0300 +@@ -95,7 +95,7 @@ static inline void check_for_tasks(int c + struct task_struct *p; + + write_lock_irq(&tasklist_lock); +- for_each_process(p) { ++ for_each_process_all(p) { + if (task_cpu(p) == cpu && + (!cputime_eq(p->utime, cputime_zero) || + !cputime_eq(p->stime, cputime_zero))) +diff -uprN linux-2.6.15.orig/kernel/cpuset.c linux-2.6.15-ve025stab014/kernel/cpuset.c +--- linux-2.6.15.orig/kernel/cpuset.c 2006-01-03 06:21:10.000000000 +0300 ++++ 
linux-2.6.15-ve025stab014/kernel/cpuset.c 2006-01-27 14:48:08.000000000 +0300 +@@ -857,7 +857,7 @@ static int attach_task(struct cpuset *cs + if (pid) { + read_lock(&tasklist_lock); + +- tsk = find_task_by_pid(pid); ++ tsk = find_task_by_pid_all(pid); + if (!tsk || tsk->flags & PF_EXITING) { + read_unlock(&tasklist_lock); + return -ESRCH; +@@ -1259,13 +1259,13 @@ static inline int pid_array_load(pid_t * + + read_lock(&tasklist_lock); + +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (p->cpuset == cs) { + pidarray[n++] = p->pid; + if (unlikely(n == npids)) + goto array_full; + } +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + array_full: + read_unlock(&tasklist_lock); +diff -uprN linux-2.6.15.orig/kernel/exit.c linux-2.6.15-ve025stab014/kernel/exit.c +--- linux-2.6.15.orig/kernel/exit.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/exit.c 2006-01-27 14:48:08.000000000 +0300 +@@ -35,6 +35,8 @@ + #include <asm/pgtable.h> + #include <asm/mmu_context.h> + ++#include <ub/ub_oom.h> ++ + extern void sem_exit (void); + extern struct task_struct *child_reaper; + +@@ -55,18 +57,19 @@ static void __unhash_process(struct task + } + + REMOVE_LINKS(p); ++ REMOVE_VE_LINKS(p); + } + + void release_task(struct task_struct * p) + { + int zap_leader; + task_t *leader; +- struct dentry *proc_dentry; ++ struct dentry *proc_dentry[2]; + + repeat: + atomic_dec(&p->user->processes); + spin_lock(&p->proc_lock); +- proc_dentry = proc_pid_unhash(p); ++ proc_pid_unhash(p, proc_dentry); + write_lock_irq(&tasklist_lock); + if (unlikely(p->ptrace)) + __ptrace_unlink(p); +@@ -79,6 +82,8 @@ repeat: + * the process by __unhash_process. + */ + __unhash_process(p); ++ nr_zombie--; ++ nr_dead++; + + /* + * If we are the last non-leader member of the thread +@@ -106,6 +111,10 @@ repeat: + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); + release_thread(p); ++#ifdef CONFIG_VE ++ if (atomic_dec_and_test(&VE_TASK_INFO(p)->owner_env->pcounter)) ++ do_env_cleanup(VE_TASK_INFO(p)->owner_env); ++#endif + put_task_struct(p); + + p = leader; +@@ -117,10 +126,10 @@ repeat: + + void unhash_process(struct task_struct *p) + { +- struct dentry *proc_dentry; ++ struct dentry *proc_dentry[2]; + + spin_lock(&p->proc_lock); +- proc_dentry = proc_pid_unhash(p); ++ proc_pid_unhash(p, proc_dentry); + write_lock_irq(&tasklist_lock); + __unhash_process(p); + write_unlock_irq(&tasklist_lock); +@@ -138,14 +147,16 @@ int session_of_pgrp(int pgrp) + struct task_struct *p; + int sid = -1; + ++ WARN_ON(is_virtual_pid(pgrp)); ++ + read_lock(&tasklist_lock); +- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { + if (p->signal->session > 0) { + sid = p->signal->session; + goto out; + } +- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); +- p = find_task_by_pid(pgrp); ++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); ++ p = find_task_by_pid_ve(pgrp); + if (p) + sid = p->signal->session; + out: +@@ -167,17 +178,19 @@ static int will_become_orphaned_pgrp(int + struct task_struct *p; + int ret = 1; + +- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { ++ WARN_ON(is_virtual_pid(pgrp)); ++ ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { + if (p == ignored_task + || p->exit_state +- || p->real_parent->pid == 1) ++ || virt_pid(p->real_parent) == 1) + continue; + if (process_group(p->real_parent) != pgrp + && p->real_parent->signal->session == p->signal->session) { + ret = 0; + break; + } +- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); ++ } 
while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); + return ret; /* (sighing) "Often!" */ + } + +@@ -185,6 +198,8 @@ int is_orphaned_pgrp(int pgrp) + { + int retval; + ++ WARN_ON(is_virtual_pid(pgrp)); ++ + read_lock(&tasklist_lock); + retval = will_become_orphaned_pgrp(pgrp, NULL); + read_unlock(&tasklist_lock); +@@ -197,7 +212,7 @@ static inline int has_stopped_jobs(int p + int retval = 0; + struct task_struct *p; + +- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { + if (p->state != TASK_STOPPED) + continue; + +@@ -213,7 +228,7 @@ static inline int has_stopped_jobs(int p + + retval = 1; + break; +- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); + return retval; + } + +@@ -260,6 +275,9 @@ void __set_special_pids(pid_t session, p + { + struct task_struct *curr = current; + ++ WARN_ON(is_virtual_pid(pgrp)); ++ WARN_ON(is_virtual_pid(session)); ++ + if (curr->signal->session != session) { + detach_pid(curr, PIDTYPE_SID); + curr->signal->session = session; +@@ -278,6 +296,7 @@ void set_special_pids(pid_t session, pid + __set_special_pids(session, pgrp); + write_unlock_irq(&tasklist_lock); + } ++EXPORT_SYMBOL(set_special_pids); + + /* + * Let kernel threads use this to say that they +@@ -607,13 +626,12 @@ static inline void reparent_thread(task_ + static inline void forget_original_parent(struct task_struct * father, + struct list_head *to_release) + { +- struct task_struct *p, *reaper = father; ++ struct task_struct *p, *tsk_reaper, *reaper = father; + struct list_head *_p, *_n; + + do { + reaper = next_thread(reaper); + if (reaper == father) { +- reaper = child_reaper; + break; + } + } while (reaper->exit_state); +@@ -635,9 +653,16 @@ static inline void forget_original_paren + /* if father isn't the real parent, then ptrace must be enabled */ + BUG_ON(father != p->real_parent && !ptrace); + ++ tsk_reaper = reaper; ++ if (tsk_reaper == father) ++#ifdef CONFIG_VE ++ tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry; ++ if (tsk_reaper == p) ++#endif ++ tsk_reaper = child_reaper; + if (father == p->real_parent) { +- /* reparent with a reaper, real father it's us */ +- choose_new_parent(p, reaper, child_reaper); ++ /* reparent with a tsk_reaper, real father it's us */ ++ choose_new_parent(p, tsk_reaper, child_reaper); + reparent_thread(p, father, 0); + } else { + /* reparent ptraced task to its real parent */ +@@ -658,7 +683,15 @@ static inline void forget_original_paren + } + list_for_each_safe(_p, _n, &father->ptrace_children) { + p = list_entry(_p,struct task_struct,ptrace_list); +- choose_new_parent(p, reaper, child_reaper); ++ ++ tsk_reaper = reaper; ++ if (tsk_reaper == father) ++#ifdef CONFIG_VE ++ tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry; ++ if (tsk_reaper == p) ++#endif ++ tsk_reaper = child_reaper; ++ choose_new_parent(p, tsk_reaper, child_reaper); + reparent_thread(p, father, 1); + } + } +@@ -754,6 +787,9 @@ static void exit_notify(struct task_stru + && !capable(CAP_KILL)) + tsk->exit_signal = SIGCHLD; + ++ if (tsk->exit_signal != -1 && t == child_reaper) ++ /* We dont want people slaying init. */ ++ tsk->exit_signal = SIGCHLD; + + /* If something other than our normal parent is ptracing us, then + * send it a SIGCHLD instead of honoring exit_signal. 
exit_signal +@@ -772,6 +808,7 @@ static void exit_notify(struct task_stru + unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) + state = EXIT_DEAD; + tsk->exit_state = state; ++ nr_zombie++; + + write_unlock_irq(&tasklist_lock); + +@@ -786,6 +823,82 @@ static void exit_notify(struct task_stru + release_task(tsk); + } + ++#ifdef CONFIG_VE ++/* ++ * Handle exitting of init process, it's a special case for VE. ++ */ ++static void do_initproc_exit(void) ++{ ++ struct task_struct *tsk; ++ struct ve_struct *env; ++ struct siginfo info; ++ struct task_struct *g, *p; ++ long delay = 1L; ++ ++ tsk = current; ++ env = VE_TASK_INFO(current)->owner_env; ++ if (env->init_entry != tsk) ++ return; ++ ++ if (ve_is_super(env) && tsk->pid == 1) ++ panic("Attempted to kill init!"); ++ ++ memset(&info, 0, sizeof(info)); ++ info.si_errno = 0; ++ info.si_code = SI_KERNEL; ++ info.si_pid = virt_pid(tsk); ++ info.si_uid = current->uid; ++ info.si_signo = SIGKILL; ++ ++ /* ++ * Here the VE changes its state into "not running". ++ * op_sem taken for write is a barrier to all VE manipulations from ++ * ioctl: it waits for operations currently in progress and blocks all ++ * subsequent operations until is_running is set to 0 and op_sem is ++ * released. ++ */ ++ down_write(&env->op_sem); ++ env->is_running = 0; ++ up_write(&env->op_sem); ++ ++ /* send kill to all processes of VE */ ++ read_lock(&tasklist_lock); ++ do_each_thread_ve(g, p) { ++ force_sig_info(SIGKILL, &info, p); ++ } while_each_thread_ve(g, p); ++ read_unlock(&tasklist_lock); ++ ++ /* wait for all init childs exit */ ++ while (atomic_read(&env->pcounter) > 1) { ++ if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0) ++ continue; ++ /* it was ENOCHLD or no more children somehow */ ++ if (atomic_read(&env->pcounter) == 1) ++ break; ++ ++ /* clear all signals to avoid wakeups */ ++ if (signal_pending(tsk)) ++ flush_signals(tsk); ++ /* we have child without signal sent */ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(delay); ++ delay = (delay < HZ) ? (delay << 1) : HZ; ++ read_lock(&tasklist_lock); ++ do_each_thread_ve(g, p) { ++ if (p != tsk) ++ force_sig_info(SIGKILL, &info, p); ++ } while_each_thread_ve(g, p); ++ read_unlock(&tasklist_lock); ++ } ++ env->init_entry = child_reaper; ++ write_lock_irq(&tasklist_lock); ++ REMOVE_LINKS(tsk); ++ tsk->parent = tsk->real_parent = child_reaper; ++ SET_LINKS(tsk); ++ write_unlock_irq(&tasklist_lock); ++} ++#endif ++ + fastcall NORET_TYPE void do_exit(long code) + { + struct task_struct *tsk = current; +@@ -799,8 +912,12 @@ fastcall NORET_TYPE void do_exit(long co + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); ++#ifdef CONFIG_VE ++ do_initproc_exit(); ++#else + if (unlikely(tsk->pid == 1)) + panic("Attempted to kill init!"); ++#endif + if (tsk->io_context) + exit_io_context(); + +@@ -866,6 +983,7 @@ fastcall NORET_TYPE void do_exit(long co + tsk->exit_code = code; + proc_exit_connector(tsk); + exit_notify(tsk); ++ ub_oom_task_exit(tsk); + #ifdef CONFIG_NUMA + mpol_free(tsk->mempolicy); + tsk->mempolicy = NULL; +@@ -901,7 +1019,14 @@ asmlinkage long sys_exit(int error_code) + + task_t fastcall *next_thread(const task_t *p) + { +- return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); ++ task_t *tsk; ++ ++ tsk = pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); ++#ifdef CONFIG_VE ++ /* all threads should belong to ONE ve! 
*/ ++ BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env); ++#endif ++ return tsk; + } + + EXPORT_SYMBOL(next_thread); +@@ -951,14 +1076,19 @@ asmlinkage void sys_exit_group(int error + static int eligible_child(pid_t pid, int options, task_t *p) + { + if (pid > 0) { +- if (p->pid != pid) ++ if ((is_virtual_pid(pid) ? virt_pid(p) : p->pid) != pid) + return 0; + } else if (!pid) { + if (process_group(p) != process_group(current)) + return 0; + } else if (pid != -1) { +- if (process_group(p) != -pid) +- return 0; ++ if (__is_virtual_pid(-pid)) { ++ if (virt_pgid(p) != -pid) ++ return 0; ++ } else { ++ if (process_group(p) != -pid) ++ return 0; ++ } + } + + /* +@@ -1143,7 +1273,7 @@ static int wait_task_zombie(task_t *p, i + p->exit_state = EXIT_ZOMBIE; + return retval; + } +- retval = p->pid; ++ retval = get_task_pid(p); + if (p->real_parent != p->parent) { + write_lock_irq(&tasklist_lock); + /* Double-check with lock held. */ +@@ -1278,7 +1408,7 @@ bail_ref: + if (!retval && infop) + retval = put_user(p->uid, &infop->si_uid); + if (!retval) +- retval = p->pid; ++ retval = get_task_pid(p); + put_task_struct(p); + + BUG_ON(!retval); +diff -uprN linux-2.6.15.orig/kernel/fork.c linux-2.6.15-ve025stab014/kernel/fork.c +--- linux-2.6.15.orig/kernel/fork.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/fork.c 2006-01-27 14:48:08.000000000 +0300 +@@ -20,6 +20,7 @@ + #include <linux/vmalloc.h> + #include <linux/completion.h> + #include <linux/namespace.h> ++#include <linux/file.h> + #include <linux/personality.h> + #include <linux/mempolicy.h> + #include <linux/sem.h> +@@ -51,11 +52,15 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++#include <ub/ub_misc.h> ++ + /* + * Protected counters by write_lock_irq(&tasklist_lock) + */ + unsigned long total_forks; /* Handle normal Linux uptimes. */ + int nr_threads; /* The idle threads do not count.. 
*/ ++EXPORT_SYMBOL(nr_threads); + + int max_threads; /* tunable limit on nr_threads */ + +@@ -102,6 +107,7 @@ static kmem_cache_t *mm_cachep; + + void free_task(struct task_struct *tsk) + { ++ ub_task_uncharge(tsk); + free_thread_info(tsk->thread_info); + free_task_struct(tsk); + } +@@ -119,6 +125,12 @@ void __put_task_struct(struct task_struc + free_uid(tsk->user); + put_group_info(tsk->group_info); + ++#ifdef CONFIG_VE ++ put_ve(VE_TASK_INFO(tsk)->owner_env); ++ write_lock_irq(&tasklist_lock); ++ nr_dead--; ++ write_unlock_irq(&tasklist_lock); ++#endif + if (!profile_handoff_task(tsk)) + free_task(tsk); + } +@@ -132,7 +144,7 @@ void __init fork_init(unsigned long memp + /* create a slab on which task_structs can be allocated */ + task_struct_cachep = + kmem_cache_create("task_struct", sizeof(struct task_struct), +- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); ++ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_UBC, NULL, NULL); + #endif + + /* +@@ -163,22 +175,33 @@ static struct task_struct *dup_task_stru + + tsk = alloc_task_struct(); + if (!tsk) +- return NULL; ++ goto out; + + ti = alloc_thread_info(tsk); +- if (!ti) { +- free_task_struct(tsk); +- return NULL; +- } ++ if (!ti) ++ goto out_tsk; + + *tsk = *orig; + tsk->thread_info = ti; + setup_thread_stack(tsk, orig); + ++ if (test_ti_thread_flag(orig->thread_info, TIF_MEMDIE)) ++ goto out_ti; ++ ++ if (ub_task_charge(orig, tsk)) ++ goto out_ti; ++ + /* One for us, one for whoever does the "release_task()" (usually parent) */ + atomic_set(&tsk->usage,2); + atomic_set(&tsk->fs_excl, 0); + return tsk; ++ ++out_ti: ++ free_thread_info(ti); ++out_tsk: ++ free_task_struct(tsk); ++out: ++ return NULL; + } + + #ifdef CONFIG_MMU +@@ -216,7 +239,12 @@ static inline int dup_mmap(struct mm_str + -pages); + continue; + } ++ + charge = 0; ++ if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start, ++ mpnt->vm_flags & ~VM_LOCKED, ++ mpnt->vm_file, UB_HARD)) ++ goto fail_noch; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + if (security_vm_enough_memory(len)) +@@ -235,6 +263,7 @@ static inline int dup_mmap(struct mm_str + tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_mm = mm; + tmp->vm_next = NULL; ++ set_vma_rss(tmp, 0); + anon_vma_link(tmp); + file = tmp->vm_file; + if (file) { +@@ -263,7 +292,7 @@ static inline int dup_mmap(struct mm_str + rb_parent = &tmp->vm_rb; + + mm->map_count++; +- retval = copy_page_range(mm, oldmm, mpnt); ++ retval = copy_page_range(mm, oldmm, tmp, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); +@@ -280,6 +309,9 @@ out: + fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); + fail_nomem: ++ ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start, ++ mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file); ++fail_noch: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +@@ -310,7 +342,8 @@ static inline void mm_free_pgd(struct mm + + #include <linux/init_task.h> + +-static struct mm_struct * mm_init(struct mm_struct * mm) ++static struct mm_struct * mm_init(struct mm_struct * mm, ++ struct task_struct *tsk) + { + atomic_set(&mm->mm_users, 1); + atomic_set(&mm->mm_count, 1); +@@ -325,11 +358,14 @@ static struct mm_struct * mm_init(struct + mm->ioctx_list = NULL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; ++ set_mm_ub(mm, tsk); + + if (likely(!mm_alloc_pgd(mm))) { + mm->def_flags = 0; + return mm; + } ++ ++ put_mm_ub(mm); + free_mm(mm); + return NULL; + } +@@ -344,7 +380,7 @@ struct mm_struct * mm_alloc(void) + mm = 
allocate_mm(); + if (mm) { + memset(mm, 0, sizeof(*mm)); +- mm = mm_init(mm); ++ mm = mm_init(mm, NULL); + } + return mm; + } +@@ -359,6 +395,7 @@ void fastcall __mmdrop(struct mm_struct + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); ++ put_mm_ub(mm); + free_mm(mm); + } + +@@ -478,7 +515,7 @@ static int copy_mm(unsigned long clone_f + + /* Copy the current MM stuff.. */ + memcpy(mm, oldmm, sizeof(*mm)); +- if (!mm_init(mm)) ++ if (!mm_init(mm, tsk)) + goto fail_nomem; + + if (init_new_context(tsk,mm)) +@@ -848,7 +885,7 @@ asmlinkage long sys_set_tid_address(int + { + current->clear_child_tid = tidptr; + +- return current->pid; ++ return virt_pid(current); + } + + /* +@@ -865,7 +902,7 @@ static task_t *copy_process(unsigned lon + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, +- int pid) ++ int pid, long pid0) + { + int retval; + struct task_struct *p = NULL; +@@ -926,12 +963,20 @@ static task_t *copy_process(unsigned lon + p->did_exec = 0; + copy_flags(clone_flags, p); + p->pid = pid; ++#ifdef CONFIG_VE ++ set_virt_pid(p, alloc_vpid(p->pid, pid0 ? : -1)); ++ if (virt_pid(p) < 0) ++ goto bad_fork_cleanup_module; ++#endif + retval = -EFAULT; + if (clone_flags & CLONE_PARENT_SETTID) +- if (put_user(p->pid, parent_tidptr)) ++ if (put_user(virt_pid(p), parent_tidptr)) + goto bad_fork_cleanup; + + p->proc_dentry = NULL; ++#ifdef CONFIG_VE ++ p->ve_task_info.glob_proc_dentry = NULL; ++#endif + + INIT_LIST_HEAD(&p->children); + INIT_LIST_HEAD(&p->sibling); +@@ -974,8 +1019,13 @@ static task_t *copy_process(unsigned lon + #endif + + p->tgid = p->pid; +- if (clone_flags & CLONE_THREAD) ++ set_virt_tgid(p, virt_pid(p)); ++ set_virt_pgid(p, virt_pgid(current)); ++ set_virt_sid(p, virt_sid(current)); ++ if (clone_flags & CLONE_THREAD) { + p->tgid = current->tgid; ++ set_virt_tgid(p, virt_tgid(current)); ++ } + + if ((retval = security_task_alloc(p))) + goto bad_fork_cleanup_policy; +@@ -1124,6 +1174,12 @@ static task_t *copy_process(unsigned lon + if (unlikely(p->ptrace & PT_PTRACED)) + __ptrace_link(p, current->parent); + ++#ifdef CONFIG_VE ++ SET_VE_LINKS(p); ++ atomic_inc(&p->ve_task_info.owner_env->pcounter); ++ get_ve(p->ve_task_info.owner_env); ++ seqcount_init(&p->ve_task_info.wakeup_lock); ++#endif + attach_pid(p, PIDTYPE_PID, p->pid); + attach_pid(p, PIDTYPE_TGID, p->tgid); + if (thread_group_leader(p)) { +@@ -1174,6 +1230,11 @@ bad_fork_cleanup_policy: + mpol_free(p->mempolicy); + #endif + bad_fork_cleanup: ++#ifdef CONFIG_VE ++ if (virt_pid(p) != p->pid && virt_pid(p) > 0) ++ free_vpid(virt_pid(p), get_exec_env()); ++bad_fork_cleanup_module: ++#endif + if (p->binfmt) + module_put(p->binfmt->module); + bad_fork_cleanup_put_domain: +@@ -1198,7 +1259,7 @@ task_t * __devinit fork_idle(int cpu) + task_t *task; + struct pt_regs regs; + +- task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); ++ task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0, 0); + if (!task) + return ERR_PTR(-ENOMEM); + init_idle(task, cpu); +@@ -1228,12 +1289,13 @@ static inline int fork_traceflag (unsign + * It copies the process, and if successful kick-starts + * it and waits for it to finish using the VM if required. 
+ */ +-long do_fork(unsigned long clone_flags, ++long do_fork_pid(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, +- int __user *child_tidptr) ++ int __user *child_tidptr, ++ long pid0) + { + struct task_struct *p; + int trace = 0; +@@ -1247,7 +1309,8 @@ long do_fork(unsigned long clone_flags, + clone_flags |= CLONE_PTRACE; + } + +- p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); ++ p = copy_process(clone_flags, stack_start, regs, stack_size, ++ parent_tidptr, child_tidptr, pid, pid0); + /* + * Do this prior waking up the new thread - the thread pointer + * might get invalid after that point, if the thread exits quickly. +@@ -1255,6 +1318,7 @@ long do_fork(unsigned long clone_flags, + if (!IS_ERR(p)) { + struct completion vfork; + ++ pid = virt_pid(p); + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); +@@ -1290,24 +1354,37 @@ long do_fork(unsigned long clone_flags, + return pid; + } + ++EXPORT_SYMBOL(do_fork_pid); ++ ++long do_fork(unsigned long clone_flags, ++ unsigned long stack_start, ++ struct pt_regs *regs, ++ unsigned long stack_size, ++ int __user *parent_tidptr, ++ int __user *child_tidptr) ++{ ++ return do_fork_pid(clone_flags, stack_start, regs, stack_size, ++ parent_tidptr, child_tidptr, 0); ++} ++ + void __init proc_caches_init(void) + { + sighand_cachep = kmem_cache_create("sighand_cache", + sizeof(struct sighand_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + signal_cachep = kmem_cache_create("signal_cache", + sizeof(struct signal_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + files_cachep = kmem_cache_create("files_cache", + sizeof(struct files_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + fs_cachep = kmem_cache_create("fs_cache", + sizeof(struct fs_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, +- SLAB_PANIC, NULL, NULL); ++ SLAB_PANIC|SLAB_UBC, NULL, NULL); + mm_cachep = kmem_cache_create("mm_struct", + sizeof(struct mm_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + } +diff -uprN linux-2.6.15.orig/kernel/irq/handle.c linux-2.6.15-ve025stab014/kernel/irq/handle.c +--- linux-2.6.15.orig/kernel/irq/handle.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/irq/handle.c 2006-01-27 14:48:06.000000000 +0300 +@@ -14,6 +14,8 @@ + + #include "internals.h" + ++#include <ub/beancounter.h> ++ + /* + * Linux has a controller-independent interrupt architecture. 
+ * Every controller has a 'controller-template', that is used +@@ -80,10 +82,12 @@ fastcall int handle_IRQ_event(unsigned i + struct irqaction *action) + { + int ret, retval = 0, status = 0; ++ struct user_beancounter *ub; + + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + ++ ub = set_exec_ub(get_ub0()); + do { + ret = action->handler(irq, action->dev_id, regs); + if (ret == IRQ_HANDLED) +@@ -91,6 +95,7 @@ fastcall int handle_IRQ_event(unsigned i + retval |= ret; + action = action->next; + } while (action); ++ (void)set_exec_ub(ub); + + if (status & SA_SAMPLE_RANDOM) + add_interrupt_randomness(irq); +diff -uprN linux-2.6.15.orig/kernel/kmod.c linux-2.6.15-ve025stab014/kernel/kmod.c +--- linux-2.6.15.orig/kernel/kmod.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/kmod.c 2006-01-27 14:48:08.000000000 +0300 +@@ -78,6 +78,10 @@ int request_module(const char *fmt, ...) + #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ + static int kmod_loop_msg; + ++ /* Don't allow request_module() inside VE. */ ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ + va_start(args, fmt); + ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); + va_end(args); +@@ -246,6 +250,9 @@ int call_usermodehelper_keys(char *path, + }; + DECLARE_WORK(work, __call_usermodehelper, &sub_info); + ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ + if (!khelper_wq) + return -EBUSY; + +diff -uprN linux-2.6.15.orig/kernel/kthread.c linux-2.6.15-ve025stab014/kernel/kthread.c +--- linux-2.6.15.orig/kernel/kthread.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/kthread.c 2006-01-27 14:48:08.000000000 +0300 +@@ -114,7 +114,7 @@ static void keventd_create_kthread(void + create->result = ERR_PTR(pid); + } else { + wait_for_completion(&create->started); +- create->result = find_task_by_pid(pid); ++ create->result = find_task_by_pid_all(pid); + } + complete(&create->done); + } +diff -uprN linux-2.6.15.orig/kernel/panic.c linux-2.6.15-ve025stab014/kernel/panic.c +--- linux-2.6.15.orig/kernel/panic.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/panic.c 2006-01-27 14:48:08.000000000 +0300 +@@ -23,6 +23,8 @@ + int panic_timeout; + int panic_on_oops; + int tainted; ++int kernel_text_csum_broken; ++EXPORT_SYMBOL(kernel_text_csum_broken); + + EXPORT_SYMBOL(panic_timeout); + +@@ -155,7 +157,8 @@ const char *print_tainted(void) + { + static char buf[20]; + if (tainted) { +- snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", ++ snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", ++ kernel_text_csum_broken ? 'B' : ' ', + tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', + tainted & TAINT_FORCED_MODULE ? 'F' : ' ', + tainted & TAINT_UNSAFE_SMP ? 
'S' : ' ', +diff -uprN linux-2.6.15.orig/kernel/pid.c linux-2.6.15-ve025stab014/kernel/pid.c +--- linux-2.6.15.orig/kernel/pid.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/pid.c 2006-01-27 14:48:08.000000000 +0300 +@@ -27,6 +27,10 @@ + #include <linux/bootmem.h> + #include <linux/hash.h> + ++#ifdef CONFIG_VE ++static void __free_vpid(int vpid, struct ve_struct *ve); ++#endif ++ + #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) + static struct hlist_head *pid_hash[PIDTYPE_MAX]; + static int pidhash_shift; +@@ -57,8 +61,14 @@ typedef struct pidmap { + void *page; + } pidmap_t; + ++#ifdef CONFIG_VE ++#define PIDMAP_NRFREE (BITS_PER_PAGE/2) ++#else ++#define PIDMAP_NRFREE BITS_PER_PAGE ++#endif ++ + static pidmap_t pidmap_array[PIDMAP_ENTRIES] = +- { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; ++ { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(PIDMAP_NRFREE), NULL } }; + + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); + +@@ -67,6 +77,8 @@ fastcall void free_pidmap(int pid) + pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; + int offset = pid & BITS_PER_PAGE_MASK; + ++ BUG_ON(__is_virtual_pid(pid) || pid == 1); ++ + clear_bit(offset, map->page); + atomic_inc(&map->nr_free); + } +@@ -77,6 +89,8 @@ int alloc_pidmap(void) + pidmap_t *map; + + pid = last + 1; ++ if (__is_virtual_pid(pid)) ++ pid += VPID_DIV; + if (pid >= pid_max) + pid = RESERVED_PIDS; + offset = pid & BITS_PER_PAGE_MASK; +@@ -106,6 +120,8 @@ int alloc_pidmap(void) + return pid; + } + offset = find_next_offset(map, offset); ++ if (__is_virtual_pid(offset)) ++ offset += VPID_DIV; + pid = mk_pid(map, offset); + /* + * find_next_offset() found a bit, the pid from it +@@ -143,6 +159,7 @@ struct pid * fastcall find_pid(enum pid_ + } + return NULL; + } ++EXPORT_SYMBOL(find_pid); + + int fastcall attach_pid(task_t *task, enum pid_type type, int nr) + { +@@ -201,13 +218,26 @@ void fastcall detach_pid(task_t *task, e + if (tmp != type && find_pid(tmp, nr)) + return; + ++#ifdef CONFIG_VE ++ __free_vpid(task->pids[type].vnr, VE_TASK_INFO(task)->owner_env); ++#endif + free_pidmap(nr); + } + + task_t *find_task_by_pid_type(int type, int nr) + { ++ BUG(); ++ return NULL; ++} ++ ++EXPORT_SYMBOL(find_task_by_pid_type); ++ ++task_t *find_task_by_pid_type_all(int type, int nr) ++{ + struct pid *pid; + ++ BUG_ON(nr != -1 && is_virtual_pid(nr)); ++ + pid = find_pid(type, nr); + if (!pid) + return NULL; +@@ -215,7 +245,35 @@ task_t *find_task_by_pid_type(int type, + return pid_task(&pid->pid_list, type); + } + +-EXPORT_SYMBOL(find_task_by_pid_type); ++EXPORT_SYMBOL(find_task_by_pid_type_all); ++ ++#ifdef CONFIG_VE ++ ++task_t *find_task_by_pid_type_ve(int type, int nr) ++{ ++ task_t *tsk; ++ int gnr = nr; ++ struct pid *pid; ++ ++ if (is_virtual_pid(nr)) { ++ gnr = __vpid_to_pid(nr); ++ if (unlikely(gnr == -1)) ++ return NULL; ++ } ++ ++ pid = find_pid(type, gnr); ++ if (!pid) ++ return NULL; ++ ++ tsk = pid_task(&pid->pid_list, type); ++ if (!ve_accessible(VE_TASK_INFO(tsk)->owner_env, get_exec_env())) ++ return NULL; ++ return tsk; ++} ++ ++EXPORT_SYMBOL(find_task_by_pid_type_ve); ++ ++#endif + + /* + * This function switches the PIDs if a non-leader thread calls +@@ -234,6 +292,9 @@ void switch_exec_pids(task_t *leader, ta + + leader->pid = leader->tgid = thread->pid; + thread->pid = thread->tgid; ++ set_virt_tgid(leader, virt_pid(thread)); ++ set_virt_pid(leader, virt_pid(thread)); ++ set_virt_pid(thread, virt_tgid(thread)); + + attach_pid(thread, 
PIDTYPE_PID, thread->pid); + attach_pid(thread, PIDTYPE_TGID, thread->tgid); +@@ -247,6 +308,337 @@ void switch_exec_pids(task_t *leader, ta + attach_pid(leader, PIDTYPE_SID, leader->signal->session); + } + ++#ifdef CONFIG_VE ++ ++/* Virtual PID bits. ++ * ++ * At the moment all internal structures in kernel store real global pid. ++ * The only place, where virtual PID is used, is at user frontend. We ++ * remap virtual pids obtained from user to global ones (vpid_to_pid) and ++ * map globals to virtuals before showing them to user (virt_pid_type). ++ * ++ * We hold virtual PIDs inside struct pid, so map global -> virtual is easy. ++ */ ++ ++pid_t _pid_type_to_vpid(int type, pid_t pid) ++{ ++ struct pid * p; ++ ++ if (unlikely(is_virtual_pid(pid))) ++ return -1; ++ ++ read_lock(&tasklist_lock); ++ p = find_pid(type, pid); ++ if (p) { ++ pid = p->vnr; ++ } else { ++ pid = -1; ++ } ++ read_unlock(&tasklist_lock); ++ return pid; ++} ++ ++pid_t pid_type_to_vpid(int type, pid_t pid) ++{ ++ int vpid; ++ ++ if (unlikely(pid <= 0)) ++ return pid; ++ ++ BUG_ON(is_virtual_pid(pid)); ++ ++ if (ve_is_super(get_exec_env())) ++ return pid; ++ ++ vpid = _pid_type_to_vpid(type, pid); ++ if (unlikely(vpid == -1)) { ++ /* It is allowed: global pid can be used everywhere. ++ * This can happen, when kernel remembers stray pids: ++ * signal queues, locks etc. ++ */ ++ vpid = pid; ++ } ++ return vpid; ++} ++ ++/* To map virtual pids to global we maintain special hash table. ++ * ++ * Mapping entries are allocated when a process with non-trivial ++ * mapping is forked, which is possible only after VE migrated. ++ * Mappings are destroyed, when a global pid is removed from global ++ * pidmap, which means we do not need to refcount mappings. ++ */ ++ ++static struct hlist_head *vpid_hash; ++ ++struct vpid_mapping ++{ ++ int vpid; ++ int veid; ++ int pid; ++ struct hlist_node link; ++}; ++ ++static kmem_cache_t *vpid_mapping_cachep; ++ ++static inline int vpid_hashfn(int vnr, int veid) ++{ ++ return hash_long((unsigned long)(vnr+(veid<<16)), pidhash_shift); ++} ++ ++struct vpid_mapping *__lookup_vpid_mapping(int vnr, int veid) ++{ ++ struct hlist_node *elem; ++ struct vpid_mapping *map; ++ ++ hlist_for_each_entry(map, elem, ++ &vpid_hash[vpid_hashfn(vnr, veid)], link) { ++ if (map->vpid == vnr && map->veid == veid) ++ return map; ++ } ++ return NULL; ++} ++ ++/* __vpid_to_pid() is raw version of vpid_to_pid(). It is to be used ++ * only under tasklist_lock. In some places we must use only this version ++ * (f.e. __kill_pg_info is called under write lock!) ++ * ++ * Caller should pass virtual pid. This function returns an error, when ++ * seeing a global pid. ++ */ ++int __vpid_to_pid(int pid) ++{ ++ struct vpid_mapping *map; ++ ++ if (unlikely(!is_virtual_pid(pid) || ve_is_super(get_exec_env()))) ++ return -1; ++ ++ if (!get_exec_env()->sparse_vpid) { ++ if (pid != 1) ++ return pid - VPID_DIV; ++ return get_exec_env()->init_entry->pid; ++ } ++ ++ map = __lookup_vpid_mapping(pid, VEID(get_exec_env())); ++ if (map) ++ return map->pid; ++ return -1; ++} ++ ++int vpid_to_pid(int pid) ++{ ++ /* User gave bad pid. It is his problem. 
*/ ++ if (unlikely(pid <= 0)) ++ return pid; ++ ++ if (!is_virtual_pid(pid)) ++ return pid; ++ ++ read_lock(&tasklist_lock); ++ pid = __vpid_to_pid(pid); ++ read_unlock(&tasklist_lock); ++ return pid; ++} ++ ++/* VEs which never migrated have trivial "arithmetic" mapping pid <-> vpid: ++ * ++ * vpid == 1 -> ve->init_task->pid ++ * else pid & ~VPID_DIV ++ * ++ * In this case VE has ve->sparse_vpid = 0 and we do not use vpid hash table. ++ * ++ * When VE migrates and we see non-trivial mapping the first time, we ++ * scan process table and populate mapping hash table. ++ */ ++ ++static int add_mapping(int pid, int vpid, int veid, struct hlist_head *cache) ++{ ++ if (pid > 0 && vpid > 0 && !__lookup_vpid_mapping(vpid, veid)) { ++ struct vpid_mapping *m; ++ if (hlist_empty(cache)) { ++ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_ATOMIC); ++ if (unlikely(m == NULL)) ++ return -ENOMEM; ++ } else { ++ m = hlist_entry(cache->first, struct vpid_mapping, link); ++ hlist_del(&m->link); ++ } ++ m->pid = pid; ++ m->vpid = vpid; ++ m->veid = veid; ++ hlist_add_head(&m->link, ++ &vpid_hash[vpid_hashfn(vpid, veid)]); ++ } ++ return 0; ++} ++ ++static int switch_to_sparse_mapping(int pid) ++{ ++ struct ve_struct *env = get_exec_env(); ++ struct hlist_head cache; ++ task_t *g, *t; ++ int pcount; ++ int err; ++ ++ /* Transition happens under write_lock_irq, so we try to make ++ * it more reliable and fast preallocating mapping entries. ++ * pcounter may be not enough, we could have lots of orphaned ++ * process groups and sessions, which also require mappings. ++ */ ++ INIT_HLIST_HEAD(&cache); ++ pcount = atomic_read(&env->pcounter); ++ err = -ENOMEM; ++ while (pcount > 0) { ++ struct vpid_mapping *m; ++ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL); ++ if (!m) ++ goto out; ++ hlist_add_head(&m->link, &cache); ++ pcount--; ++ } ++ ++ write_lock_irq(&tasklist_lock); ++ err = 0; ++ if (env->sparse_vpid) ++ goto out_unlock; ++ ++ err = -ENOMEM; ++ do_each_thread_ve(g, t) { ++ if (t->pid == pid) ++ continue; ++ if (add_mapping(t->pid, virt_pid(t), VEID(env), &cache)) ++ goto out_unlock; ++ } while_each_thread_ve(g, t); ++ ++ for_each_process_ve(t) { ++ if (t->pid == pid) ++ continue; ++ ++ if (add_mapping(t->tgid, virt_tgid(t), VEID(env), &cache)) ++ goto out_unlock; ++ if (add_mapping(t->signal->pgrp, virt_pgid(t), VEID(env), &cache)) ++ goto out_unlock; ++ if (add_mapping(t->signal->session, virt_sid(t), VEID(env), &cache)) ++ goto out_unlock; ++ } ++ env->sparse_vpid = 1; ++ err = 0; ++ ++out_unlock: ++ if (err) { ++ int i; ++ ++ for (i=0; i<(1<<pidhash_shift); i++) { ++ struct hlist_node *elem, *next; ++ struct vpid_mapping *map; ++ ++ hlist_for_each_entry_safe(map, elem, next, &vpid_hash[i], link) { ++ if (map->veid == VEID(env)) { ++ hlist_del(elem); ++ hlist_add_head(elem, &cache); ++ } ++ } ++ } ++ } ++ write_unlock_irq(&tasklist_lock); ++ ++out: ++ while (!hlist_empty(&cache)) { ++ struct vpid_mapping *m; ++ m = hlist_entry(cache.first, struct vpid_mapping, link); ++ hlist_del(&m->link); ++ kmem_cache_free(vpid_mapping_cachep, m); ++ } ++ return err; ++} ++ ++int alloc_vpid(int pid, int virt_pid) ++{ ++ int result; ++ struct vpid_mapping *m; ++ struct ve_struct *env = get_exec_env(); ++ ++ if (ve_is_super(env) || !env->virt_pids) ++ return pid; ++ ++ if (!env->sparse_vpid) { ++ if (virt_pid == -1) ++ return pid + VPID_DIV; ++ ++ if (virt_pid == 1 || virt_pid == pid + VPID_DIV) ++ return virt_pid; ++ ++ if ((result = switch_to_sparse_mapping(pid)) < 0) ++ return result; ++ } ++ ++ m = 
kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL); ++ if (!m) ++ return -ENOMEM; ++ ++ m->pid = pid; ++ m->veid = VEID(env); ++ ++ result = (virt_pid == -1) ? pid + VPID_DIV : virt_pid; ++ ++ write_lock_irq(&tasklist_lock); ++ if (unlikely(__lookup_vpid_mapping(result, m->veid))) { ++ if (virt_pid > 0) { ++ result = -EEXIST; ++ goto out; ++ } ++ ++ /* No luck. Now we search for some not-existing vpid. ++ * It is weak place. We do linear search. */ ++ do { ++ result++; ++ if (!__is_virtual_pid(result)) ++ result += VPID_DIV; ++ if (result >= pid_max) ++ result = RESERVED_PIDS + VPID_DIV; ++ } while (__lookup_vpid_mapping(result, m->veid) != NULL); ++ ++ /* And set last_pid in hope future alloc_pidmap to avoid ++ * collisions after future alloc_pidmap() */ ++ last_pid = result - VPID_DIV; ++ } ++ if (result > 0) { ++ m->vpid = result; ++ hlist_add_head(&m->link, ++ &vpid_hash[vpid_hashfn(result, m->veid)]); ++ } ++out: ++ write_unlock_irq(&tasklist_lock); ++ if (result < 0) ++ kmem_cache_free(vpid_mapping_cachep, m); ++ return result; ++} ++ ++static void __free_vpid(int vpid, struct ve_struct *ve) ++{ ++ struct vpid_mapping *m; ++ ++ if (!ve->sparse_vpid) ++ return; ++ ++ if (!__is_virtual_pid(vpid) && (vpid != 1 || ve_is_super(ve))) ++ return; ++ ++ m = __lookup_vpid_mapping(vpid, ve->veid); ++ BUG_ON(m == NULL); ++ hlist_del(&m->link); ++ kmem_cache_free(vpid_mapping_cachep, m); ++} ++EXPORT_SYMBOL(alloc_vpid); ++ ++void free_vpid(int vpid, struct ve_struct *ve) ++{ ++ write_lock_irq(&tasklist_lock); ++ __free_vpid(vpid, ve); ++ write_unlock_irq(&tasklist_lock); ++} ++#endif ++ + /* + * The pid hash table is scaled according to the amount of memory in the + * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or +@@ -273,6 +665,14 @@ void __init pidhash_init(void) + for (j = 0; j < pidhash_size; j++) + INIT_HLIST_HEAD(&pid_hash[i][j]); + } ++ ++#ifdef CONFIG_VE ++ vpid_hash = alloc_bootmem(pidhash_size * sizeof(struct hlist_head)); ++ if (!vpid_hash) ++ panic("Could not alloc vpid_hash!\n"); ++ for (j = 0; j < pidhash_size; j++) ++ INIT_HLIST_HEAD(&vpid_hash[j]); ++#endif + } + + void __init pidmap_init(void) +@@ -289,4 +689,12 @@ void __init pidmap_init(void) + + for (i = 0; i < PIDTYPE_MAX; i++) + attach_pid(current, i, 0); ++ ++#ifdef CONFIG_VE ++ vpid_mapping_cachep = ++ kmem_cache_create("vpid_mapping", ++ sizeof(struct vpid_mapping), ++ __alignof__(struct vpid_mapping), ++ SLAB_PANIC|SLAB_UBC, NULL, NULL); ++#endif + } +diff -uprN linux-2.6.15.orig/kernel/posix-cpu-timers.c linux-2.6.15-ve025stab014/kernel/posix-cpu-timers.c +--- linux-2.6.15.orig/kernel/posix-cpu-timers.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/posix-cpu-timers.c 2006-01-27 14:48:08.000000000 +0300 +@@ -20,7 +20,7 @@ static int check_clock(clockid_t which_c + return 0; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? 
+ p->tgid != current->tgid : p->tgid != pid)) { + error = -EINVAL; +@@ -303,7 +303,7 @@ int posix_cpu_clock_get(clockid_t which_ + */ + struct task_struct *p; + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (p) { + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (p->tgid == current->tgid) { +@@ -347,7 +347,7 @@ int posix_cpu_timer_create(struct k_itim + if (pid == 0) { + p = current; + } else { +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (p && p->tgid != current->tgid) + p = NULL; + } +@@ -355,7 +355,7 @@ int posix_cpu_timer_create(struct k_itim + if (pid == 0) { + p = current->group_leader; + } else { +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (p && p->tgid != pid) + p = NULL; + } +diff -uprN linux-2.6.15.orig/kernel/posix-timers.c linux-2.6.15-ve025stab014/kernel/posix-timers.c +--- linux-2.6.15.orig/kernel/posix-timers.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/posix-timers.c 2006-01-27 14:48:08.000000000 +0300 +@@ -31,6 +31,7 @@ + * POSIX clocks & timers + */ + #include <linux/mm.h> ++#include <linux/module.h> + #include <linux/smp_lock.h> + #include <linux/interrupt.h> + #include <linux/slab.h> +@@ -48,6 +49,8 @@ + #include <linux/workqueue.h> + #include <linux/module.h> + ++#include <ub/beancounter.h> ++ + #ifndef div_long_long_rem + #include <asm/div64.h> + +@@ -258,7 +261,8 @@ static __init int init_posix_timers(void + register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + + posix_timers_cache = kmem_cache_create("posix_timers_cache", +- sizeof (struct k_itimer), 0, 0, NULL, NULL); ++ sizeof (struct k_itimer), 0, ++ SLAB_UBC, NULL, NULL); + idr_init(&posix_timers_id); + return 0; + } +@@ -411,6 +415,13 @@ exit: + + int posix_timer_event(struct k_itimer *timr,int si_private) + { ++ int ret; ++ struct ve_struct *ve; ++ struct user_beancounter *ub; ++ ++ ve = set_exec_env(timr->it_process->ve_task_info.owner_env); ++ ub = set_exec_ub(timr->it_process->task_bc.task_ub); ++ + memset(&timr->sigq->info, 0, sizeof(siginfo_t)); + timr->sigq->info.si_sys_private = si_private; + /* +@@ -430,11 +441,11 @@ int posix_timer_event(struct k_itimer *t + + if (timr->it_sigev_notify & SIGEV_THREAD_ID) { + struct task_struct *leader; +- int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, ++ ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); + + if (likely(ret >= 0)) +- return ret; ++ goto out; + + timr->it_sigev_notify = SIGEV_SIGNAL; + leader = timr->it_process->group_leader; +@@ -442,8 +453,12 @@ int posix_timer_event(struct k_itimer *t + timr->it_process = leader; + } + +- return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, ++ ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); ++out: ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); ++ return ret; + } + EXPORT_SYMBOL_GPL(posix_timer_event); + +@@ -518,7 +533,7 @@ static inline struct task_struct * good_ + struct task_struct *rtn = current->group_leader; + + if ((event->sigev_notify & SIGEV_THREAD_ID ) && +- (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || ++ (!(rtn = find_task_by_pid_ve(event->sigev_notify_thread_id)) || + rtn->tgid != current->tgid || + (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + return NULL; +@@ -1219,6 +1234,7 @@ int do_posix_clock_monotonic_gettime(str + { + return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); + } ++EXPORT_SYMBOL(do_posix_clock_monotonic_gettime); + + int 
do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp) + { +diff -uprN linux-2.6.15.orig/kernel/power/process.c linux-2.6.15-ve025stab014/kernel/power/process.c +--- linux-2.6.15.orig/kernel/power/process.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/power/process.c 2006-01-27 14:48:08.000000000 +0300 +@@ -67,7 +67,7 @@ int freeze_processes(void) + do { + todo = 0; + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (!freezeable(p)) + continue; + if (frozen(p)) +@@ -78,7 +78,7 @@ int freeze_processes(void) + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + todo++; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + yield(); /* Yield is okay here */ + if (todo && time_after(jiffies, start_time + TIMEOUT)) { +@@ -95,7 +95,7 @@ int freeze_processes(void) + */ + if (todo) { + read_lock(&tasklist_lock); +- do_each_thread(g, p) ++ do_each_thread_all(g, p) + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + p->flags &= ~PF_FREEZE; +@@ -103,7 +103,7 @@ int freeze_processes(void) + recalc_sigpending_tsk(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +- while_each_thread(g, p); ++ while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + return todo; + } +@@ -119,12 +119,12 @@ void thaw_processes(void) + + printk( "Restarting tasks..." ); + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (!freezeable(p)) + continue; + if (!thaw_process(p)) + printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + read_unlock(&tasklist_lock); + schedule(); +diff -uprN linux-2.6.15.orig/kernel/power/swsusp.c linux-2.6.15-ve025stab014/kernel/power/swsusp.c +--- linux-2.6.15.orig/kernel/power/swsusp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/power/swsusp.c 2006-01-27 14:48:06.000000000 +0300 +@@ -71,6 +71,9 @@ + #include <linux/crypto.h> + #include <asm/scatterlist.h> + ++#include <ub/beancounter.h> ++#include <ub/ub_vmpages.h> ++ + #include "power.h" + + #ifdef CONFIG_HIGHMEM +@@ -361,7 +364,7 @@ static int write_page(unsigned long addr + swp_entry_t entry; + int error = 0; + +- entry = get_swap_page(); ++ entry = get_swap_page(get_ub0()); + if (swp_offset(entry) && + swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { + error = rw_swap_page_sync(WRITE, entry, +diff -uprN linux-2.6.15.orig/kernel/printk.c linux-2.6.15-ve025stab014/kernel/printk.c +--- linux-2.6.15.orig/kernel/printk.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/printk.c 2006-01-27 14:48:08.000000000 +0300 +@@ -30,6 +30,7 @@ + #include <linux/smp.h> + #include <linux/security.h> + #include <linux/bootmem.h> ++#include <linux/vzratelimit.h> + #include <linux/syscalls.h> + + #include <asm/uaccess.h> +@@ -83,7 +84,7 @@ static int console_locked; + * It is also used in interesting ways to provide interlocking in + * release_console_sem(). 
+ */ +-static DEFINE_SPINLOCK(logbuf_lock); ++DEFINE_SPINLOCK(logbuf_lock); + + #define LOG_BUF_MASK (log_buf_len-1) + #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) +@@ -122,6 +123,31 @@ static char *log_buf = __log_buf; + static int log_buf_len = __LOG_BUF_LEN; + static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ + ++#ifdef CONFIG_VE ++ ++#define ve_log_wait (*(get_exec_env()->_log_wait)) ++#define ve_log_start (*(get_exec_env()->_log_start)) ++#define ve_log_end (*(get_exec_env()->_log_end)) ++#define ve_logged_chars (*(get_exec_env()->_logged_chars)) ++#define ve_log_buf (get_exec_env()->log_buf) ++#define ve_log_buf_len (ve_is_super(get_exec_env()) ? \ ++ log_buf_len : VE_DEFAULT_LOG_BUF_LEN) ++#define VE_LOG_BUF_MASK (ve_log_buf_len - 1) ++#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK]) ++ ++#else ++ ++#define ve_log_wait log_wait ++#define ve_log_start log_start ++#define ve_log_end log_end ++#define ve_logged_chars logged_chars ++#define ve_log_buf log_buf ++#define ve_log_buf_len log_buf_len ++#define VE_LOG_BUF_MASK LOG_BUF_MASK ++#define VE_LOG_BUF(idx) LOG_BUF(idx) ++ ++#endif /* CONFIG_VE */ ++ + /* + * Setup a list of consoles. Called from init/main.c + */ +@@ -179,18 +205,18 @@ static int __init log_buf_len_setup(char + + spin_lock_irqsave(&logbuf_lock, flags); + log_buf_len = size; +- log_buf = new_log_buf; ++ ve_log_buf = new_log_buf; + +- offset = start = min(con_start, log_start); ++ offset = start = min(con_start, ve_log_start); + dest_idx = 0; +- while (start != log_end) { +- log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; ++ while (start != ve_log_end) { ++ ve_log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; + start++; + dest_idx++; + } +- log_start -= offset; ++ ve_log_start -= offset; + con_start -= offset; +- log_end -= offset; ++ ve_log_end -= offset; + spin_unlock_irqrestore(&logbuf_lock, flags); + + printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); +@@ -223,6 +249,10 @@ int do_syslog(int type, char __user *buf + char c; + int error = 0; + ++ if (!ve_is_super(get_exec_env()) && ++ (type == 6 || type == 7 || type == 8)) ++ goto out; ++ + error = security_syslog(type); + if (error) + return error; +@@ -243,15 +273,15 @@ int do_syslog(int type, char __user *buf + error = -EFAULT; + goto out; + } +- error = wait_event_interruptible(log_wait, +- (log_start - log_end)); ++ error = wait_event_interruptible(ve_log_wait, ++ (ve_log_start - ve_log_end)); + if (error) + goto out; + i = 0; + spin_lock_irq(&logbuf_lock); +- while (!error && (log_start != log_end) && i < len) { +- c = LOG_BUF(log_start); +- log_start++; ++ while (!error && (ve_log_start != ve_log_end) && i < len) { ++ c = VE_LOG_BUF(ve_log_start); ++ ve_log_start++; + spin_unlock_irq(&logbuf_lock); + error = __put_user(c,buf); + buf++; +@@ -277,15 +307,17 @@ int do_syslog(int type, char __user *buf + error = -EFAULT; + goto out; + } ++ if (ve_log_buf == NULL) ++ goto out; + count = len; +- if (count > log_buf_len) +- count = log_buf_len; ++ if (count > ve_log_buf_len) ++ count = ve_log_buf_len; + spin_lock_irq(&logbuf_lock); +- if (count > logged_chars) +- count = logged_chars; ++ if (count > ve_logged_chars) ++ count = ve_logged_chars; + if (do_clear) +- logged_chars = 0; +- limit = log_end; ++ ve_logged_chars = 0; ++ limit = ve_log_end; + /* + * __put_user() could sleep, and while we sleep + * printk() could overwrite the messages +@@ -294,9 +326,9 @@ int do_syslog(int type, char __user *buf + */ + for (i = 0; i < 
count && !error; i++) { + j = limit-1-i; +- if (j + log_buf_len < log_end) ++ if (j + ve_log_buf_len < ve_log_end) + break; +- c = LOG_BUF(j); ++ c = VE_LOG_BUF(j); + spin_unlock_irq(&logbuf_lock); + error = __put_user(c,&buf[count-1-i]); + cond_resched(); +@@ -320,7 +352,7 @@ int do_syslog(int type, char __user *buf + } + break; + case 5: /* Clear ring buffer */ +- logged_chars = 0; ++ ve_logged_chars = 0; + break; + case 6: /* Disable logging to console */ + console_loglevel = minimum_console_loglevel; +@@ -338,10 +370,10 @@ int do_syslog(int type, char __user *buf + error = 0; + break; + case 9: /* Number of chars in the log buffer */ +- error = log_end - log_start; ++ error = ve_log_end - ve_log_start; + break; + case 10: /* Size of the log buffer */ +- error = log_buf_len; ++ error = ve_log_buf_len; + break; + default: + error = -EINVAL; +@@ -365,7 +397,7 @@ static void __call_console_drivers(unsig + + for (con = console_drivers; con; con = con->next) { + if ((con->flags & CON_ENABLED) && con->write) +- con->write(con, &LOG_BUF(start), end - start); ++ con->write(con, &VE_LOG_BUF(start), end - start); + } + } + +@@ -377,11 +409,11 @@ static void _call_console_drivers(unsign + { + if (msg_log_level < console_loglevel && + console_drivers && start != end) { +- if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { ++ if ((start & VE_LOG_BUF_MASK) > (end & VE_LOG_BUF_MASK)) { + /* wrapped write */ +- __call_console_drivers(start & LOG_BUF_MASK, +- log_buf_len); +- __call_console_drivers(0, end & LOG_BUF_MASK); ++ __call_console_drivers(start & VE_LOG_BUF_MASK, ++ ve_log_buf_len); ++ __call_console_drivers(0, end & VE_LOG_BUF_MASK); + } else { + __call_console_drivers(start, end); + } +@@ -405,16 +437,16 @@ static void call_console_drivers(unsigne + start_print = start; + while (cur_index != end) { + if (msg_level < 0 && ((end - cur_index) > 2) && +- LOG_BUF(cur_index + 0) == '<' && +- LOG_BUF(cur_index + 1) >= '0' && +- LOG_BUF(cur_index + 1) <= '7' && +- LOG_BUF(cur_index + 2) == '>') { +- msg_level = LOG_BUF(cur_index + 1) - '0'; ++ VE_LOG_BUF(cur_index + 0) == '<' && ++ VE_LOG_BUF(cur_index + 1) >= '0' && ++ VE_LOG_BUF(cur_index + 1) <= '7' && ++ VE_LOG_BUF(cur_index + 2) == '>') { ++ msg_level = VE_LOG_BUF(cur_index + 1) - '0'; + cur_index += 3; + start_print = cur_index; + } + while (cur_index != end) { +- char c = LOG_BUF(cur_index); ++ char c = VE_LOG_BUF(cur_index); + + cur_index++; + if (c == '\n') { +@@ -439,14 +471,14 @@ static void call_console_drivers(unsigne + + static void emit_log_char(char c) + { +- LOG_BUF(log_end) = c; +- log_end++; +- if (log_end - log_start > log_buf_len) +- log_start = log_end - log_buf_len; +- if (log_end - con_start > log_buf_len) +- con_start = log_end - log_buf_len; +- if (logged_chars < log_buf_len) +- logged_chars++; ++ VE_LOG_BUF(ve_log_end) = c; ++ ve_log_end++; ++ if (ve_log_end - ve_log_start > ve_log_buf_len) ++ ve_log_start = ve_log_end - ve_log_buf_len; ++ if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len) ++ con_start = ve_log_end - ve_log_buf_len; ++ if (ve_logged_chars < ve_log_buf_len) ++ ve_logged_chars++; + } + + /* +@@ -511,18 +543,68 @@ __attribute__((weak)) unsigned long long + * printf(3) + */ + ++static inline int ve_log_init(void) ++{ ++#ifdef CONFIG_VE ++ if (ve_log_buf != NULL) ++ return 0; ++ ++ if (ve_is_super(get_exec_env())) { ++ ve0._log_wait = &log_wait; ++ ve0._log_start = &log_start; ++ ve0._log_end = &log_end; ++ ve0._logged_chars = &logged_chars; ++ ve0.log_buf = log_buf; ++ return 0; ++ } 
++ ++ ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC); ++ if (!ve_log_buf) ++ return -ENOMEM; ++ ++ memset(ve_log_buf, 0, ve_log_buf_len); ++#endif ++ return 0; ++} ++ + asmlinkage int printk(const char *fmt, ...) + { + va_list args; + int r; ++ struct ve_struct *ve; + + va_start(args, fmt); ++ ve = set_exec_env(get_ve0()); + r = vprintk(fmt, args); ++ (void)set_exec_env(ve); + va_end(args); + + return r; + } + ++asmlinkage int ve_printk(int dst, const char *fmt, ...) ++{ ++ va_list args; ++ int printed_len; ++ ++ printed_len = 0; ++ if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) { ++ struct ve_struct *env; ++ va_start(args, fmt); ++ env = set_exec_env(get_ve0()); ++ printed_len = vprintk(fmt, args); ++ (void)set_exec_env(env); ++ va_end(args); ++ } ++ if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) { ++ va_start(args, fmt); ++ printed_len = vprintk(fmt, args); ++ va_end(args); ++ } ++ return printed_len; ++} ++EXPORT_SYMBOL(ve_printk); ++ + /* cpu currently holding logbuf_lock */ + static volatile unsigned int printk_cpu = UINT_MAX; + +@@ -533,6 +615,7 @@ asmlinkage int vprintk(const char *fmt, + char *p; + static char printk_buf[1024]; + static int log_level_unknown = 1; ++ int err, need_wake; + + preempt_disable(); + if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) +@@ -544,6 +627,12 @@ asmlinkage int vprintk(const char *fmt, + spin_lock_irqsave(&logbuf_lock, flags); + printk_cpu = smp_processor_id(); + ++ err = ve_log_init(); ++ if (err) { ++ spin_unlock_irqrestore(&logbuf_lock, flags); ++ return err; ++ } ++ + /* Emit the output into the temporary buffer */ + printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); + +@@ -615,7 +704,12 @@ asmlinkage int vprintk(const char *fmt, + spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } +- if (!down_trylock(&console_sem)) { ++ if (!ve_is_super(get_exec_env())) { ++ need_wake = (ve_log_start != ve_log_end); ++ spin_unlock_irqrestore(&logbuf_lock, flags); ++ if (!oops_in_progress && need_wake) ++ wake_up_interruptible(&ve_log_wait); ++ } else if (!down_trylock(&console_sem)) { + console_locked = 1; + /* + * We own the drivers. We can drop the spinlock and let +@@ -754,12 +848,12 @@ void release_console_sem(void) + + for ( ; ; ) { + spin_lock_irqsave(&logbuf_lock, flags); +- wake_klogd |= log_start - log_end; +- if (con_start == log_end) ++ wake_klogd |= ve_log_start - ve_log_end; ++ if (con_start == ve_log_end) + break; /* Nothing to print */ + _con_start = con_start; +- _log_end = log_end; +- con_start = log_end; /* Flush */ ++ _log_end = ve_log_end; ++ con_start = ve_log_end; /* Flush */ + spin_unlock(&logbuf_lock); + call_console_drivers(_con_start, _log_end); + local_irq_restore(flags); +@@ -768,8 +862,8 @@ void release_console_sem(void) + console_may_schedule = 0; + up(&console_sem); + spin_unlock_irqrestore(&logbuf_lock, flags); +- if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) +- wake_up_interruptible(&log_wait); ++ if (wake_klogd && !oops_in_progress && waitqueue_active(&ve_log_wait)) ++ wake_up_interruptible(&ve_log_wait); + } + EXPORT_SYMBOL(release_console_sem); + +@@ -940,7 +1034,7 @@ void register_console(struct console *co + * for us. + */ + spin_lock_irqsave(&logbuf_lock, flags); +- con_start = log_start; ++ con_start = ve_log_start; + spin_unlock_irqrestore(&logbuf_lock, flags); + } + release_console_sem(); +@@ -1049,3 +1143,33 @@ int printk_ratelimit(void) + printk_ratelimit_burst); + } + EXPORT_SYMBOL(printk_ratelimit); ++ ++/* ++ * Rate limiting stuff. 
++ */ ++int vz_ratelimit(struct vz_rate_info *p) ++{ ++ unsigned long cjif, djif; ++ unsigned long flags; ++ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; ++ long new_bucket; ++ ++ spin_lock_irqsave(&ratelimit_lock, flags); ++ cjif = jiffies; ++ djif = cjif - p->last; ++ if (djif < p->interval) { ++ if (p->bucket >= p->burst) { ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 0; ++ } ++ p->bucket++; ++ } else { ++ new_bucket = p->bucket - (djif / (unsigned)p->interval); ++ if (new_bucket < 0) ++ new_bucket = 0; ++ p->bucket = new_bucket + 1; ++ } ++ p->last = cjif; ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 1; ++} +diff -uprN linux-2.6.15.orig/kernel/ptrace.c linux-2.6.15-ve025stab014/kernel/ptrace.c +--- linux-2.6.15.orig/kernel/ptrace.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ptrace.c 2006-01-27 14:48:08.000000000 +0300 +@@ -135,7 +135,10 @@ static int may_attach(struct task_struct + smp_rmb(); + if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + return -EPERM; +- ++ if (!task->mm->vps_dumpable && !ve_is_super(get_exec_env())) ++ return -EPERM; ++ if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env())) ++ return -EPERM; + return security_ptrace(current, task); + } + +@@ -445,7 +448,7 @@ static int ptrace_get_task_struct(long r + + ret = -ESRCH; + read_lock(&tasklist_lock); +- child = find_task_by_pid(pid); ++ child = find_task_by_pid_ve(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); +diff -uprN linux-2.6.15.orig/kernel/sched.c linux-2.6.15-ve025stab014/kernel/sched.c +--- linux-2.6.15.orig/kernel/sched.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/sched.c 2006-01-27 14:48:08.000000000 +0300 +@@ -219,6 +219,9 @@ struct runqueue { + */ + unsigned long nr_uninterruptible; + ++ unsigned long nr_sleeping; ++ unsigned long nr_stopped; ++ + unsigned long expired_timestamp; + unsigned long long timestamp_last_tick; + task_t *curr, *idle; +@@ -283,6 +286,11 @@ for (domain = rcu_dereference(cpu_rq(cpu + # define finish_arch_switch(prev) do { } while (0) + #endif + ++struct kernel_stat_glob kstat_glob; ++spinlock_t kstat_glb_lock = SPIN_LOCK_UNLOCKED; ++EXPORT_SYMBOL(kstat_glob); ++EXPORT_SYMBOL(kstat_glb_lock); ++ + #ifndef __ARCH_WANT_UNLOCKED_CTXSW + static inline int task_running(runqueue_t *rq, task_t *p) + { +@@ -373,6 +381,185 @@ static inline void task_rq_unlock(runque + spin_unlock_irqrestore(&rq->lock, *flags); + } + ++#ifdef CONFIG_VE ++#define ve_nr_iowait_inc(env, cpu) \ ++ do { \ ++ VE_CPU_STATS((env), (cpu))->nr_iowait++; \ ++ } while(0) ++#define ve_nr_iowait_dec(env, cpu) \ ++ do { \ ++ VE_CPU_STATS((env), (cpu))->nr_iowait--; \ ++ } while(0) ++#define ve_nr_unint_inc(env, cpu) \ ++ do { \ ++ VE_CPU_STATS((env), (cpu))->nr_unint++; \ ++ } while(0) ++#define ve_nr_unint_dec(env, cpu) \ ++ do { \ ++ VE_CPU_STATS((env), (cpu))->nr_unint--; \ ++ } while(0) ++ ++#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0) ++ ++cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu) ++{ ++ struct ve_cpu_stats *ve_stat; ++ unsigned v; ++ cycles_t strt, ret, cycles; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ do { ++ v = read_seqcount_begin(&ve_stat->stat_lock); ++ ret = ve_stat->idle_time; ++ strt = ve_stat->strt_idle_time; ++ if (strt && nr_uninterruptible_ve(ve) == 0) { ++ cycles = get_cycles(); ++ if (cycles_after(cycles, strt)) ++ ret += cycles - strt; ++ } ++ } while (read_seqcount_retry(&ve_stat->stat_lock, v)); ++ return ret; 
++} ++EXPORT_SYMBOL(ve_sched_get_idle_time); ++ ++cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu) ++{ ++ struct ve_cpu_stats *ve_stat; ++ unsigned v; ++ cycles_t strt, ret, cycles; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ do { ++ v = read_seqcount_begin(&ve_stat->stat_lock); ++ ret = ve_stat->iowait_time; ++ strt = ve_stat->strt_idle_time; ++ if (strt && nr_uninterruptible_ve(ve) > 0) { ++ cycles = get_cycles(); ++ if (cycles_after(cycles, strt)) ++ ret += cycles - strt; ++ } ++ } while (read_seqcount_retry(&ve_stat->stat_lock, v)); ++ return ret; ++} ++ ++EXPORT_SYMBOL(ve_sched_get_iowait_time); ++ ++static inline void ve_stop_idle(struct ve_struct *ve, ++ unsigned int cpu, cycles_t cycles) ++{ ++ struct ve_cpu_stats *ve_stat; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ ++ write_seqcount_begin(&ve_stat->stat_lock); ++ if (ve_stat->strt_idle_time) { ++ if (cycles_after(cycles, ve_stat->strt_idle_time)) { ++ if (nr_uninterruptible_ve(ve) == 0) ++ ve_stat->idle_time += cycles - ++ ve_stat->strt_idle_time; ++ else ++ ve_stat->iowait_time += cycles - ++ ve_stat->strt_idle_time; ++ } ++ ve_stat->strt_idle_time = 0; ++ } ++ write_seqcount_end(&ve_stat->stat_lock); ++} ++ ++static inline void ve_strt_idle(struct ve_struct *ve, ++ unsigned int cpu, cycles_t cycles) ++{ ++ struct ve_cpu_stats *ve_stat; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ ++ write_seqcount_begin(&ve_stat->stat_lock); ++ ve_stat->strt_idle_time = cycles; ++ write_seqcount_end(&ve_stat->stat_lock); ++} ++ ++#define ve_nr_running_inc(env, cpu, cycles) do { \ ++ if (++VE_CPU_STATS((env), (cpu))->nr_running == 1) \ ++ ve_stop_idle(env, cpu, cycles); \ ++ } while (0) ++#define ve_nr_running_dec(env, cpu, cyclses) do { \ ++ if (--VE_CPU_STATS((env), (cpu))->nr_running == 0) \ ++ ve_strt_idle(env, cpu, cycles); \ ++ } while (0) ++ ++void ve_sched_attach(struct ve_struct *envid) ++{ ++ struct task_struct *tsk; ++ unsigned int cpu; ++ cycles_t cycles; ++ ++ tsk = current; ++ preempt_disable(); ++ cycles = get_cycles(); ++ cpu = task_cpu(tsk); ++ ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles); ++ ve_nr_running_inc(envid, cpu, cycles); ++ preempt_enable(); ++} ++EXPORT_SYMBOL(ve_sched_attach); ++ ++static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc) ++{ ++ struct ve_task_info *ti; ++ ++ ti = VE_TASK_INFO(p); ++ write_seqcount_begin(&ti->wakeup_lock); ++ ti->wakeup_stamp = cyc; ++ write_seqcount_end(&ti->wakeup_lock); ++} ++ ++static inline void update_sched_lat(struct task_struct *t, cycles_t cycles) ++{ ++ int cpu; ++ cycles_t ve_wstamp; ++ ++ /* safe due to runqueue lock */ ++ cpu = smp_processor_id(); ++ ve_wstamp = t->ve_task_info.wakeup_stamp; ++ ++ if (ve_wstamp && cycles > ve_wstamp) { ++ KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, ++ cpu, cycles - ve_wstamp); ++ KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve, ++ cpu, cycles - ve_wstamp); ++ } ++} ++ ++static inline void update_ve_task_info(task_t *prev, cycles_t cycles) ++{ ++#ifdef CONFIG_FAIRSCHED ++ if (prev != this_pcpu()->idle) { ++#else ++ if (prev != this_rq()->idle) { ++#endif ++ VE_CPU_STATS(prev->ve_task_info.owner_env, ++ smp_processor_id())->used_time += ++ cycles - prev->ve_task_info.sched_time; ++ ++ prev->ve_task_info.sched_time = cycles; ++ } ++} ++ ++#else ++#define ve_nr_running_inc(env, cpu, cycles) do { } while(0) ++#define ve_nr_running_dec(env, cpu, cycles) do { } while(0) ++#define ve_nr_iowait_inc(env, cpu) do { } while(0) ++#define ve_nr_iowait_dec(env, cpu) do { } while(0) ++#define 
ve_nr_unint_inc(env, cpu) do { } while(0) ++#define ve_nr_unint_dec(env, cpu) do { } while(0) ++#define update_ve_task_info(prev, cycles) do { } while (0) ++#endif ++ ++unsigned long nr_zombie = 0; /* protected by tasklist_lock */ ++unsigned long nr_dead = 0; ++EXPORT_SYMBOL(nr_zombie); ++EXPORT_SYMBOL(nr_dead); ++ + #ifdef CONFIG_SCHEDSTATS + /* + * bump this up when changing the output format or the meaning of an existing +@@ -671,7 +858,7 @@ static inline void dec_prio_bias(runqueu + rq->prio_bias -= MAX_PRIO - prio; + } + +-static inline void inc_nr_running(task_t *p, runqueue_t *rq) ++static inline void inc_nr_running(task_t *p, runqueue_t *rq, cycles_t cycles) + { + rq->nr_running++; + if (rt_task(p)) { +@@ -684,9 +871,11 @@ static inline void inc_nr_running(task_t + inc_prio_bias(rq, p->prio); + } else + inc_prio_bias(rq, p->static_prio); ++ ++ ve_nr_running_inc(p->ve_task_info.owner_env, task_cpu(p), cycles); + } + +-static inline void dec_nr_running(task_t *p, runqueue_t *rq) ++static inline void dec_nr_running(task_t *p, runqueue_t *rq, cycles_t cycles) + { + rq->nr_running--; + if (rt_task(p)) { +@@ -694,6 +883,8 @@ static inline void dec_nr_running(task_t + dec_prio_bias(rq, p->prio); + } else + dec_prio_bias(rq, p->static_prio); ++ ++ ve_nr_running_dec(p->ve_task_info.owner_env, task_cpu(p), cycles); + } + #else + static inline void inc_prio_bias(runqueue_t *rq, int prio) +@@ -704,14 +895,16 @@ static inline void dec_prio_bias(runqueu + { + } + +-static inline void inc_nr_running(task_t *p, runqueue_t *rq) ++static inline void inc_nr_running(task_t *p, runqueue_t *rq, cycles_t cycles) + { + rq->nr_running++; ++ ve_nr_running_inc(p->ve_task_info.owner_env, task_cpu(p), cycles); + } + +-static inline void dec_nr_running(task_t *p, runqueue_t *rq) ++static inline void dec_nr_running(task_t *p, runqueue_t *rq, cycles_t cycles) + { + rq->nr_running--; ++ ve_nr_running_dec(p->ve_task_info.owner_env, task_cpu(p), cycles); + } + #endif + +@@ -720,8 +913,15 @@ static inline void dec_nr_running(task_t + */ + static inline void __activate_task(task_t *p, runqueue_t *rq) + { ++ cycles_t cycles; ++ ++#ifdef CONFIG_VE ++ cycles = get_cycles(); ++ write_wakeup_stamp(p, cycles); ++ p->ve_task_info.sleep_time += cycles; ++#endif + enqueue_task(p, rq->active); +- inc_nr_running(p, rq); ++ inc_nr_running(p, rq, cycles); + } + + /* +@@ -730,7 +930,7 @@ static inline void __activate_task(task_ + static inline void __activate_idle_task(task_t *p, runqueue_t *rq) + { + enqueue_task_head(p, rq->active); +- inc_nr_running(p, rq); ++ inc_nr_running(p, rq, 0); + } + + static int recalc_task_prio(task_t *p, unsigned long long now) +@@ -850,7 +1050,25 @@ static void activate_task(task_t *p, run + */ + static void deactivate_task(struct task_struct *p, runqueue_t *rq) + { +- dec_nr_running(p, rq); ++ cycles_t cycles; ++#ifdef CONFIG_VE ++ unsigned int cpu; ++ struct ve_struct *ve; ++ ++ cycles = get_cycles(); ++ cpu = task_cpu(p); ++ ve = p->ve_task_info.owner_env; ++ ++ p->ve_task_info.sleep_time -= cycles; ++#endif ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ ve_nr_unint_inc(ve, cpu); ++ if (p->state == TASK_INTERRUPTIBLE) ++ rq->nr_sleeping++; ++ if (p->state == TASK_STOPPED) ++ rq->nr_stopped++; ++ ++ dec_nr_running(p, rq, cycles); + dequeue_task(p, p->array); + p->array = NULL; + } +@@ -1353,7 +1571,13 @@ out_set_cpu: + + out_activate: + #endif /* CONFIG_SMP */ +- if (old_state == TASK_UNINTERRUPTIBLE) { ++ if (old_state == TASK_INTERRUPTIBLE) ++ rq->nr_sleeping--; ++ else if (old_state == 
TASK_STOPPED) ++ rq->nr_stopped--; ++ else if (old_state == TASK_UNINTERRUPTIBLE) { ++ ve_nr_unint_dec(p->ve_task_info.owner_env, ++ smp_processor_id()); + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn +@@ -1453,6 +1677,10 @@ void fastcall sched_fork(task_t *p, int + p->first_time_slice = 1; + current->time_slice >>= 1; + p->timestamp = sched_clock(); ++#ifdef CONFIG_VE ++ /*cosmetic: sleep till wakeup below*/ ++ p->ve_task_info.sleep_time -= get_cycles(); ++#endif + if (unlikely(!current->time_slice)) { + /* + * This case is rare, it happens when the parent has only +@@ -1509,7 +1737,7 @@ void fastcall wake_up_new_task(task_t *p + list_add_tail(&p->run_list, ¤t->run_list); + p->array = current->array; + p->array->nr_active++; +- inc_nr_running(p, rq); ++ inc_nr_running(p, rq, get_cycles()); + } + set_need_resched(); + } else +@@ -1653,7 +1881,7 @@ asmlinkage void schedule_tail(task_t *pr + preempt_enable(); + #endif + if (current->set_child_tid) +- put_user(current->pid, current->set_child_tid); ++ put_user(virt_pid(current), current->set_child_tid); + } + + /* +@@ -1701,6 +1929,7 @@ unsigned long nr_running(void) + + return sum; + } ++EXPORT_SYMBOL(nr_running); + + unsigned long nr_uninterruptible(void) + { +@@ -1719,6 +1948,8 @@ unsigned long nr_uninterruptible(void) + return sum; + } + ++EXPORT_SYMBOL(nr_uninterruptible); ++ + unsigned long long nr_context_switches(void) + { + unsigned long long i, sum = 0; +@@ -1729,6 +1960,8 @@ unsigned long long nr_context_switches(v + return sum; + } + ++EXPORT_SYMBOL(nr_context_switches); ++ + unsigned long nr_iowait(void) + { + unsigned long i, sum = 0; +@@ -1739,6 +1972,79 @@ unsigned long nr_iowait(void) + return sum; + } + ++EXPORT_SYMBOL(nr_iowait); ++ ++unsigned long nr_stopped(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_cpu(i) ++ sum += cpu_rq(i)->nr_stopped; ++ ++ return sum; ++} ++ ++EXPORT_SYMBOL(nr_stopped); ++ ++unsigned long nr_sleeping(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_cpu(i) ++ sum += cpu_rq(i)->nr_sleeping; ++ ++ return sum; ++} ++ ++EXPORT_SYMBOL(nr_sleeping); ++ ++#ifdef CONFIG_VE ++unsigned long nr_running_ve(struct ve_struct *ve) ++{ ++ int i; ++ long sum; ++ cpumask_t ve_cpus; ++ ++ sum = 0; ++ ve_cpu_online_map(ve, &ve_cpus); ++ for_each_cpu_mask(i, ve_cpus) ++ sum += VE_CPU_STATS(ve, i)->nr_running; ++ return (unsigned long)(sum < 0 ? 0 : sum); ++} ++ ++EXPORT_SYMBOL(nr_running_ve); ++ ++unsigned long nr_uninterruptible_ve(struct ve_struct *ve) ++{ ++ int i; ++ long sum; ++ cpumask_t ve_cpus; ++ ++ sum = 0; ++ ve_cpu_online_map(ve, &ve_cpus); ++ for_each_cpu_mask(i, ve_cpus) ++ sum += VE_CPU_STATS(ve, i)->nr_unint; ++ return (unsigned long)(sum < 0 ? 0 : sum); ++} ++ ++EXPORT_SYMBOL(nr_uninterruptible_ve); ++ ++unsigned long nr_iowait_ve(struct ve_struct *ve) ++{ ++ int i; ++ long sum; ++ cpumask_t ve_cpus; ++ ++ sum = 0; ++ ve_cpu_online_map(ve, &ve_cpus); ++ for_each_cpu_mask(i, ve_cpus) ++ sum += VE_CPU_STATS(ve, i)->nr_iowait; ++ return (unsigned long)(sum < 0 ? 
0 : sum); ++} ++ ++EXPORT_SYMBOL(nr_iowait_ve); ++#endif ++ + #ifdef CONFIG_SMP + + /* +@@ -1853,10 +2159,14 @@ static inline + void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) + { ++ cycles_t cycles; ++ ++ cycles = get_cycles(); ++ + dequeue_task(p, src_array); +- dec_nr_running(p, src_rq); ++ dec_nr_running(p, src_rq, cycles); + set_task_cpu(p, this_cpu); +- inc_nr_running(p, this_rq); ++ inc_nr_running(p, this_rq, cycles); + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; +@@ -2560,6 +2870,15 @@ unsigned long long current_sched_time(co + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->best_expired_prio)) + ++#ifdef CONFIG_VE ++#define update_ve_cpu_time(p, time, tick) do { \ ++ VE_CPU_STATS((p)->ve_task_info.owner_env, \ ++ task_cpu(p))->time += tick; \ ++ } while (0) ++#else ++#define update_ve_cpu_time(p, time, tick) do { } while (0) ++#endif ++ + /* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to +@@ -2575,10 +2894,13 @@ void account_user_time(struct task_struc + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); +- else ++ update_ve_cpu_time(p, nice, tmp); ++ } else { + cpustat->user = cputime64_add(cpustat->user, tmp); ++ update_ve_cpu_time(p, user, tmp); ++ } + } + + /* +@@ -2595,9 +2917,11 @@ void account_system_time(struct task_str + cputime64_t tmp; + + p->stime = cputime_add(p->stime, cputime); ++ tmp = cputime_to_cputime64(cputime); ++ ++ update_ve_cpu_time(p, system, tmp); + + /* Add system time to cpustat. */ +- tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) +@@ -3099,11 +3423,30 @@ switch_tasks: + + sched_info_switch(prev, next); + if (likely(prev != next)) { ++ cycles_t cycles; ++ ++ cycles = get_cycles(); + next->timestamp = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + ++#ifdef CONFIG_VE ++ prev->ve_task_info.sleep_stamp = cycles; ++ if (prev->state == TASK_RUNNING && prev != this_rq()->idle) ++ write_wakeup_stamp(prev, cycles); ++ update_sched_lat(next, cycles); ++ ++ /* because next & prev are protected with ++ * runqueue lock we may not worry about ++ * wakeup_stamp and sched_time protection ++ * (same thing in 'else' branch below) ++ */ ++ update_ve_task_info(prev, cycles); ++ next->ve_task_info.sched_time = cycles; ++ write_wakeup_stamp(next, 0); ++#endif ++ + prepare_task_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); +@@ -3113,8 +3456,10 @@ switch_tasks: + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); +- } else ++ } else { ++ update_ve_task_info(prev, get_cycles()); + spin_unlock_irq(&rq->lock); ++ } + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) +@@ -3680,7 +4025,7 @@ task_t *idle_task(int cpu) + */ + static inline task_t *find_process_by_pid(pid_t pid) + { +- return pid ? find_task_by_pid(pid) : current; ++ return pid ? find_task_by_pid_ve(pid) : current; + } + + /* Actually do priority change: must hold rq lock. 
*/ +@@ -3732,7 +4077,7 @@ recheck: + /* + * Allow unprivileged RT tasks to decrease priority: + */ +- if (!capable(CAP_SYS_NICE)) { ++ if (!capable(CAP_SYS_ADMIN)) { + /* can't change policy */ + if (policy != p->policy && + !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) +@@ -4181,8 +4526,15 @@ void __sched io_schedule(void) + { + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + ++#ifdef CONFIG_VE ++ struct ve_struct *ve; ++ ve = current->ve_task_info.owner_env; ++#endif ++ + atomic_inc(&rq->nr_iowait); ++ ve_nr_iowait_inc(ve, smp_processor_id()); + schedule(); ++ ve_nr_iowait_dec(ve, smp_processor_id()); + atomic_dec(&rq->nr_iowait); + } + +@@ -4193,8 +4545,15 @@ long __sched io_schedule_timeout(long ti + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + long ret; + ++#ifdef CONFIG_VE ++ struct ve_struct *ve; ++ ve = current->ve_task_info.owner_env; ++#endif ++ + atomic_inc(&rq->nr_iowait); ++ ve_nr_iowait_inc(ve, smp_processor_id()); + ret = schedule_timeout(timeout); ++ ve_nr_iowait_dec(ve, smp_processor_id()); + atomic_dec(&rq->nr_iowait); + return ret; + } +@@ -4315,15 +4674,9 @@ static void show_task(task_t *p) + else + printk("?"); + #if (BITS_PER_LONG == 32) +- if (state == TASK_RUNNING) +- printk(" running "); +- else +- printk(" %08lX ", thread_saved_pc(p)); ++ printk(" %08lX ", (unsigned long)p); + #else +- if (state == TASK_RUNNING) +- printk(" running task "); +- else +- printk(" %016lx ", thread_saved_pc(p)); ++ printk(" %016lx ", (unsigned long)p); + #endif + #ifdef CONFIG_DEBUG_STACK_USAGE + { +@@ -4362,21 +4715,21 @@ void show_state(void) + #if (BITS_PER_LONG == 32) + printk("\n" + " sibling\n"); +- printk(" task PC pid father child younger older\n"); ++ printk(" task taskaddr pid father child younger older\n"); + #else + printk("\n" + " sibling\n"); +- printk(" task PC pid father child younger older\n"); ++ printk(" task taskaddr pid father child younger older\n"); + #endif + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take alot of time: + */ + touch_nmi_watchdog(); + show_task(p); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + read_unlock(&tasklist_lock); + } +@@ -4655,13 +5008,13 @@ static void migrate_live_tasks(int src_c + + write_lock_irq(&tasklist_lock); + +- do_each_thread(t, tsk) { ++ do_each_thread_all(t, tsk) { + if (tsk == current) + continue; + + if (task_cpu(tsk) == src_cpu) + move_task_off_dead_cpu(src_cpu, tsk); +- } while_each_thread(t, tsk); ++ } while_each_thread_all(t, tsk); + + write_unlock_irq(&tasklist_lock); + } +@@ -5680,7 +6033,7 @@ void normalize_rt_tasks(void) + runqueue_t *rq; + + read_lock_irq(&tasklist_lock); +- for_each_process (p) { ++ for_each_process_all (p) { + if (!rt_task(p)) + continue; + +diff -uprN linux-2.6.15.orig/kernel/signal.c linux-2.6.15-ve025stab014/kernel/signal.c +--- linux-2.6.15.orig/kernel/signal.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/signal.c 2006-01-27 14:48:08.000000000 +0300 +@@ -25,11 +25,14 @@ + #include <linux/posix-timers.h> + #include <linux/signal.h> + #include <linux/audit.h> ++#include <linux/kmem_cache.h> + #include <asm/param.h> + #include <asm/uaccess.h> + #include <asm/unistd.h> + #include <asm/siginfo.h> + ++#include <ub/ub_misc.h> ++ + /* + * SLAB caches for signal bits. 
+ */ +@@ -270,8 +273,13 @@ static struct sigqueue *__sigqueue_alloc + atomic_inc(&t->user->sigpending); + if (override_rlimit || + atomic_read(&t->user->sigpending) <= +- t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) ++ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { + q = kmem_cache_alloc(sigqueue_cachep, flags); ++ if (q && ub_siginfo_charge(q, get_task_ub(t))) { ++ kmem_cache_free(sigqueue_cachep, q); ++ q = NULL; ++ } ++ } + if (unlikely(q == NULL)) { + atomic_dec(&t->user->sigpending); + } else { +@@ -288,6 +296,7 @@ static inline void __sigqueue_free(struc + return; + atomic_dec(&q->user->sigpending); + free_uid(q->user); ++ ub_siginfo_uncharge(q); + kmem_cache_free(sigqueue_cachep, q); + } + +@@ -513,7 +522,16 @@ static int __dequeue_signal(struct sigpe + { + int sig = 0; + +- sig = next_signal(pending, mask); ++ /* SIGKILL must have priority, otherwise it is quite easy ++ * to create an unkillable process, sending sig < SIGKILL ++ * to self */ ++ if (unlikely(sigismember(&pending->signal, SIGKILL))) { ++ if (!sigismember(mask, SIGKILL)) ++ sig = SIGKILL; ++ } ++ ++ if (likely(!sig)) ++ sig = next_signal(pending, mask); + if (sig) { + if (current->notifier) { + if (sigismember(current->notifier_mask, sig)) { +@@ -800,7 +818,7 @@ static int send_signal(int sig, struct s + q->info.si_signo = sig; + q->info.si_errno = 0; + q->info.si_code = SI_USER; +- q->info.si_pid = current->pid; ++ q->info.si_pid = virt_pid(current); + q->info.si_uid = current->uid; + break; + case (unsigned long) SEND_SIG_PRIV: +@@ -1110,13 +1128,18 @@ int __kill_pg_info(int sig, struct sigin + if (pgrp <= 0) + return -EINVAL; + ++ /* Use __vpid_to_pid(). This function is used under write_lock ++ * tasklist_lock. */ ++ if (is_virtual_pid(pgrp)) ++ pgrp = __vpid_to_pid(pgrp); ++ + success = 0; + retval = -ESRCH; +- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { + int err = group_send_sig_info(sig, info, p); + success |= !err; + retval = err; +- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); + return success ? 0 : retval; + } + +@@ -1139,7 +1162,7 @@ kill_proc_info(int sig, struct siginfo * + struct task_struct *p; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + error = -ESRCH; + if (p) + error = group_send_sig_info(sig, info, p); +@@ -1158,7 +1181,7 @@ int kill_proc_info_as_uid(int sig, struc + return ret; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (!p) { + ret = -ESRCH; + goto out_unlock; +@@ -1198,8 +1221,8 @@ static int kill_something_info(int sig, + struct task_struct * p; + + read_lock(&tasklist_lock); +- for_each_process(p) { +- if (p->pid > 1 && p->tgid != current->tgid) { ++ for_each_process_ve(p) { ++ if (virt_pid(p) > 1 && p->tgid != current->tgid) { + int err = group_send_sig_info(sig, info, p); + ++count; + if (err != -EPERM) +@@ -1467,9 +1490,17 @@ void do_notify_parent(struct task_struct + BUG_ON(!tsk->ptrace && + (tsk->group_leader != tsk || !thread_group_empty(tsk))); + ++#ifdef CONFIG_VE ++ /* Allow to send only SIGCHLD from VE */ ++ if (sig != SIGCHLD && ++ tsk->ve_task_info.owner_env != ++ tsk->parent->ve_task_info.owner_env) ++ sig = SIGCHLD; ++#endif ++ + info.si_signo = sig; + info.si_errno = 0; +- info.si_pid = tsk->pid; ++ info.si_pid = get_task_pid_ve(tsk, tsk->parent->ve_task_info.owner_env); + info.si_uid = tsk->uid; + + /* FIXME: find out whether or not this is supposed to be c*time. 
*/ +@@ -1534,7 +1565,7 @@ static void do_notify_parent_cldstop(str + + info.si_signo = SIGCHLD; + info.si_errno = 0; +- info.si_pid = tsk->pid; ++ info.si_pid = get_task_pid_ve(tsk, VE_TASK_INFO(parent)->owner_env); + info.si_uid = tsk->uid; + + /* FIXME: find out whether or not this is supposed to be c*time. */ +@@ -1862,7 +1893,7 @@ relock: + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; +- info->si_pid = current->parent->pid; ++ info->si_pid = virt_pid(current->parent); + info->si_uid = current->parent->uid; + } + +@@ -1893,8 +1924,14 @@ relock: + continue; + + /* Init gets no signals it doesn't want. */ +- if (current->pid == 1) ++ if (virt_pid(current) == 1) { ++ /* Allow SIGKILL for non-root VE */ ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env()) || ++ signr != SIGKILL) ++#endif + continue; ++ } + + if (sig_kernel_stop(signr)) { + /* +@@ -2245,7 +2282,7 @@ sys_kill(int pid, int sig) + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_USER; +- info.si_pid = current->tgid; ++ info.si_pid = virt_tgid(current); + info.si_uid = current->uid; + + return kill_something_info(sig, &info, pid); +@@ -2261,12 +2298,12 @@ static int do_tkill(int tgid, int pid, i + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; +- info.si_pid = current->tgid; ++ info.si_pid = virt_tgid(current); + info.si_uid = current->uid; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); +- if (p && (tgid <= 0 || p->tgid == tgid)) { ++ p = find_task_by_pid_ve(pid); ++ if (p && (tgid <= 0 || virt_tgid(p) == tgid)) { + error = check_kill_permission(sig, &info, p); + /* + * The null signal is a permissions and process existence +diff -uprN linux-2.6.15.orig/kernel/softirq.c linux-2.6.15-ve025stab014/kernel/softirq.c +--- linux-2.6.15.orig/kernel/softirq.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/softirq.c 2006-01-27 14:48:08.000000000 +0300 +@@ -17,6 +17,8 @@ + #include <linux/kthread.h> + #include <linux/rcupdate.h> + ++#include <ub/beancounter.h> ++ + #include <asm/irq.h> + /* + - No shared variables, all the data are CPU local. 
+@@ -73,10 +75,14 @@ static inline void wakeup_softirqd(void) + + asmlinkage void __do_softirq(void) + { ++ struct user_beancounter *ub; + struct softirq_action *h; + __u32 pending; + int max_restart = MAX_SOFTIRQ_RESTART; + int cpu; ++ struct ve_struct *envid; ++ ++ envid = set_exec_env(get_ve0()); + + pending = local_softirq_pending(); + +@@ -90,6 +96,7 @@ restart: + + h = softirq_vec; + ++ ub = set_exec_ub(get_ub0()); + do { + if (pending & 1) { + h->action(h); +@@ -98,6 +105,7 @@ restart: + h++; + pending >>= 1; + } while (pending); ++ (void)set_exec_ub(ub); + + local_irq_disable(); + +@@ -108,6 +116,7 @@ restart: + if (pending) + wakeup_softirqd(); + ++ (void)set_exec_env(envid); + __local_bh_enable(); + } + +diff -uprN linux-2.6.15.orig/kernel/sys.c linux-2.6.15-ve025stab014/kernel/sys.c +--- linux-2.6.15.orig/kernel/sys.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/sys.c 2006-01-27 14:48:08.000000000 +0300 +@@ -11,6 +11,7 @@ + #include <linux/mman.h> + #include <linux/smp_lock.h> + #include <linux/notifier.h> ++#include <linux/virtinfo.h> + #include <linux/reboot.h> + #include <linux/prctl.h> + #include <linux/init.h> +@@ -223,6 +224,94 @@ int unregister_reboot_notifier(struct no + + EXPORT_SYMBOL(unregister_reboot_notifier); + ++static DECLARE_MUTEX(virtinfo_sem); ++static struct vnotifier_block *virtinfo_chain[VIRT_TYPES]; ++ ++void virtinfo_notifier_register(int type, struct vnotifier_block *nb) ++{ ++ struct vnotifier_block **p; ++ ++ down(&virtinfo_sem); ++ for (p = &virtinfo_chain[type]; ++ *p != NULL && nb->priority < (*p)->priority; ++ p = &(*p)->next); ++ nb->next = *p; ++ smp_wmb(); ++ *p = nb; ++ up(&virtinfo_sem); ++} ++ ++EXPORT_SYMBOL(virtinfo_notifier_register); ++ ++struct virtinfo_cnt_struct { ++ volatile unsigned long exit[NR_CPUS]; ++ volatile unsigned long entry; ++}; ++static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt); ++ ++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb) ++{ ++ struct vnotifier_block **p; ++ int entry_cpu, exit_cpu; ++ unsigned long cnt, ent; ++ ++ down(&virtinfo_sem); ++ for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next); ++ *p = nb->next; ++ smp_mb(); ++ ++ for_each_cpu_mask(entry_cpu, cpu_possible_map) { ++ while (1) { ++ cnt = 0; ++ for_each_cpu_mask(exit_cpu, cpu_possible_map) ++ cnt += ++ per_cpu(virtcnt, entry_cpu).exit[exit_cpu]; ++ smp_rmb(); ++ ent = per_cpu(virtcnt, entry_cpu).entry; ++ if (cnt == ent) ++ break; ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_timeout(HZ / 100); ++ } ++ } ++ up(&virtinfo_sem); ++} ++ ++EXPORT_SYMBOL(virtinfo_notifier_unregister); ++ ++int virtinfo_notifier_call(int type, unsigned long n, void *data) ++{ ++ int ret; ++ int entry_cpu, exit_cpu; ++ struct vnotifier_block *nb; ++ ++ entry_cpu = get_cpu(); ++ per_cpu(virtcnt, entry_cpu).entry++; ++ smp_wmb(); ++ put_cpu(); ++ ++ nb = virtinfo_chain[type]; ++ ret = NOTIFY_DONE; ++ while (nb) ++ { ++ ret = nb->notifier_call(nb, n, data, ret); ++ if(ret & NOTIFY_STOP_MASK) { ++ ret &= ~NOTIFY_STOP_MASK; ++ break; ++ } ++ nb = nb->next; ++ } ++ ++ exit_cpu = get_cpu(); ++ smp_wmb(); ++ per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++; ++ put_cpu(); ++ ++ return ret; ++} ++ ++EXPORT_SYMBOL(virtinfo_notifier_call); ++ + static int set_one_prio(struct task_struct *p, int niceval, int error) + { + int no_nice; +@@ -268,17 +357,19 @@ asmlinkage long sys_setpriority(int whic + switch (which) { + case PRIO_PROCESS: + if (!who) +- who = current->pid; +- p = find_task_by_pid(who); ++ who = 
virt_pid(current); ++ p = find_task_by_pid_ve(who); + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (!who) + who = process_group(current); +- do_each_task_pid(who, PIDTYPE_PGID, p) { ++ else ++ who = vpid_to_pid(who); ++ do_each_task_pid_ve(who, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); +- } while_each_task_pid(who, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(who, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = current->user; +@@ -288,10 +379,10 @@ asmlinkage long sys_setpriority(int whic + if ((who != current->uid) && !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + +- do_each_thread(g, p) ++ do_each_thread_ve(g, p) + if (p->uid == who) + error = set_one_prio(p, niceval, error); +- while_each_thread(g, p); ++ while_each_thread_ve(g, p); + if (who != current->uid) + free_uid(user); /* For find_user() */ + break; +@@ -321,8 +412,8 @@ asmlinkage long sys_getpriority(int whic + switch (which) { + case PRIO_PROCESS: + if (!who) +- who = current->pid; +- p = find_task_by_pid(who); ++ who = virt_pid(current); ++ p = find_task_by_pid_ve(who); + if (p) { + niceval = 20 - task_nice(p); + if (niceval > retval) +@@ -332,11 +423,13 @@ asmlinkage long sys_getpriority(int whic + case PRIO_PGRP: + if (!who) + who = process_group(current); +- do_each_task_pid(who, PIDTYPE_PGID, p) { ++ else ++ who = vpid_to_pid(who); ++ do_each_task_pid_ve(who, PIDTYPE_PGID, p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; +- } while_each_task_pid(who, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(who, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = current->user; +@@ -346,13 +439,13 @@ asmlinkage long sys_getpriority(int whic + if ((who != current->uid) && !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + +- do_each_thread(g, p) ++ do_each_thread_ve(g, p) + if (p->uid == who) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } +- while_each_thread(g, p); ++ while_each_thread_ve(g, p); + if (who != current->uid) + free_uid(user); /* for find_user() */ + break; +@@ -489,6 +582,35 @@ asmlinkage long sys_reboot(int magic1, i + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) ++ switch (cmd) { ++ case LINUX_REBOOT_CMD_RESTART: ++ case LINUX_REBOOT_CMD_HALT: ++ case LINUX_REBOOT_CMD_POWER_OFF: ++ case LINUX_REBOOT_CMD_RESTART2: { ++ struct siginfo info; ++ ++ info.si_errno = 0; ++ info.si_code = SI_KERNEL; ++ info.si_pid = virt_pid(current); ++ info.si_uid = current->uid; ++ info.si_signo = SIGKILL; ++ ++ /* Sending to real init is safe */ ++ send_sig_info(SIGKILL, &info, ++ get_exec_env()->init_entry); ++ } ++ ++ case LINUX_REBOOT_CMD_CAD_ON: ++ case LINUX_REBOOT_CMD_CAD_OFF: ++ return 0; ++ ++ default: ++ return -EINVAL; ++ } ++#endif ++ + lock_kernel(); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: +@@ -672,7 +794,7 @@ asmlinkage long sys_setgid(gid_t gid) + return 0; + } + +-static int set_user(uid_t new_ruid, int dumpclear) ++int set_user(uid_t new_ruid, int dumpclear) + { + struct user_struct *new_user; + +@@ -697,6 +819,7 @@ static int set_user(uid_t new_ruid, int + current->uid = new_ruid; + return 0; + } ++EXPORT_SYMBOL(set_user); + + /* + * Unprivileged users may change the real uid to the effective uid +@@ -1065,7 +1188,12 @@ asmlinkage long sys_times(struct tms __u + if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) + return -EFAULT; + } ++#ifndef CONFIG_VE + 
return (long) jiffies_64_to_clock_t(get_jiffies_64()); ++#else ++ return (long) jiffies_64_to_clock_t(get_jiffies_64() - ++ get_exec_env()->start_jiffies); ++#endif + } + + /* +@@ -1085,21 +1213,24 @@ asmlinkage long sys_setpgid(pid_t pid, p + { + struct task_struct *p; + int err = -EINVAL; ++ int _pgid; + + if (!pid) +- pid = current->pid; ++ pid = virt_pid(current); + if (!pgid) + pgid = pid; + if (pgid < 0) + return -EINVAL; + ++ _pgid = vpid_to_pid(pgid); ++ + /* From this point forward we keep holding onto the tasklist lock + * so that our parent does not change from under us. -DaveM + */ + write_lock_irq(&tasklist_lock); + + err = -ESRCH; +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (!p) + goto out; + +@@ -1124,25 +1255,35 @@ asmlinkage long sys_setpgid(pid_t pid, p + if (p->signal->leader) + goto out; + +- if (pgid != pid) { ++ pgid = virt_pid(p); ++ if (_pgid != p->pid) { + struct task_struct *p; + +- do_each_task_pid(pgid, PIDTYPE_PGID, p) { +- if (p->signal->session == current->signal->session) ++ do_each_task_pid_ve(_pgid, PIDTYPE_PGID, p) { ++ if (p->signal->session == current->signal->session) { ++ pgid = virt_pgid(p); + goto ok_pgid; +- } while_each_task_pid(pgid, PIDTYPE_PGID, p); ++ } ++ } while_each_task_pid_ve(_pgid, PIDTYPE_PGID, p); + goto out; + } + + ok_pgid: +- err = security_task_setpgid(p, pgid); ++ err = security_task_setpgid(p, _pgid); + if (err) + goto out; + + if (process_group(p) != pgid) { + detach_pid(p, PIDTYPE_PGID); +- p->signal->pgrp = pgid; +- attach_pid(p, PIDTYPE_PGID, pgid); ++ p->signal->pgrp = _pgid; ++ set_virt_pgid(p, pgid); ++ attach_pid(p, PIDTYPE_PGID, _pgid); ++ if (atomic_read(&p->signal->count) != 1) { ++ task_t *t; ++ for (t = next_thread(p); t != p; t = next_thread(t)) { ++ set_virt_pgid(t, pgid); ++ } ++ } + } + + err = 0; +@@ -1155,19 +1296,19 @@ out: + asmlinkage long sys_getpgid(pid_t pid) + { + if (!pid) { +- return process_group(current); ++ return virt_pgid(current); + } else { + int retval; + struct task_struct *p; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + + retval = -ESRCH; + if (p) { + retval = security_task_getpgid(p); + if (!retval) +- retval = process_group(p); ++ retval = virt_pgid(p); + } + read_unlock(&tasklist_lock); + return retval; +@@ -1179,7 +1320,7 @@ asmlinkage long sys_getpgid(pid_t pid) + asmlinkage long sys_getpgrp(void) + { + /* SMP - assuming writes are word atomic this is fine */ +- return process_group(current); ++ return virt_pgid(current); + } + + #endif +@@ -1187,19 +1328,19 @@ asmlinkage long sys_getpgrp(void) + asmlinkage long sys_getsid(pid_t pid) + { + if (!pid) { +- return current->signal->session; ++ return virt_sid(current); + } else { + int retval; + struct task_struct *p; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + + retval = -ESRCH; + if(p) { + retval = security_task_getsid(p); + if (!retval) +- retval = p->signal->session; ++ retval = virt_sid(p); + } + read_unlock(&tasklist_lock); + return retval; +@@ -1223,9 +1364,19 @@ asmlinkage long sys_setsid(void) + + current->signal->leader = 1; + __set_special_pids(current->pid, current->pid); ++ set_virt_pgid(current, virt_pid(current)); ++ set_virt_sid(current, virt_pid(current)); + current->signal->tty = NULL; + current->signal->tty_old_pgrp = 0; +- err = process_group(current); ++ if (atomic_read(¤t->signal->count) != 1) { ++ task_t *t; ++ for (t = next_thread(current); t != current; t = next_thread(t)) { ++ 
set_virt_pgid(t, virt_pid(current)); ++ set_virt_sid(t, virt_pid(current)); ++ } ++ } ++ ++ err = virt_pgid(current); + out: + write_unlock_irq(&tasklist_lock); + up(&tty_sem); +@@ -1505,7 +1656,7 @@ asmlinkage long sys_newuname(struct new_ + int errno = 0; + + down_read(&uts_sem); +- if (copy_to_user(name,&system_utsname,sizeof *name)) ++ if (copy_to_user(name,&ve_utsname,sizeof *name)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +@@ -1516,15 +1667,15 @@ asmlinkage long sys_sethostname(char __u + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { +- memcpy(system_utsname.nodename, tmp, len); +- system_utsname.nodename[len] = 0; ++ memcpy(ve_utsname.nodename, tmp, len); ++ ve_utsname.nodename[len] = 0; + errno = 0; + } + up_write(&uts_sem); +@@ -1540,11 +1691,11 @@ asmlinkage long sys_gethostname(char __u + if (len < 0) + return -EINVAL; + down_read(&uts_sem); +- i = 1 + strlen(system_utsname.nodename); ++ i = 1 + strlen(ve_utsname.nodename); + if (i > len) + i = len; + errno = 0; +- if (copy_to_user(name, system_utsname.nodename, i)) ++ if (copy_to_user(name, ve_utsname.nodename, i)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +@@ -1561,7 +1712,7 @@ asmlinkage long sys_setdomainname(char _ + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; +@@ -1569,8 +1720,8 @@ asmlinkage long sys_setdomainname(char _ + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { +- memcpy(system_utsname.domainname, tmp, len); +- system_utsname.domainname[len] = 0; ++ memcpy(ve_utsname.domainname, tmp, len); ++ ve_utsname.domainname[len] = 0; + errno = 0; + } + up_write(&uts_sem); +diff -uprN linux-2.6.15.orig/kernel/sysctl.c linux-2.6.15-ve025stab014/kernel/sysctl.c +--- linux-2.6.15.orig/kernel/sysctl.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/sysctl.c 2006-01-27 14:48:08.000000000 +0300 +@@ -25,6 +25,8 @@ + #include <linux/slab.h> + #include <linux/sysctl.h> + #include <linux/proc_fs.h> ++#include <linux/ve_owner.h> ++#include <linux/ve.h> + #include <linux/ctype.h> + #include <linux/utsname.h> + #include <linux/capability.h> +@@ -67,6 +69,12 @@ extern int min_free_kbytes; + extern int printk_ratelimit_jiffies; + extern int printk_ratelimit_burst; + extern int pid_max_min, pid_max_max; ++#ifdef CONFIG_VE ++int glob_virt_pids = 1; ++EXPORT_SYMBOL(glob_virt_pids); ++#endif ++ ++extern int ve_area_access_check; /* fs/namei.c */ + + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + int unknown_nmi_panic; +@@ -128,8 +136,6 @@ int randomize_va_space = 1; + + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, + ctl_table *, void **); +-static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +- void __user *buffer, size_t *lenp, loff_t *ppos); + + static ctl_table root_table[]; + static struct ctl_table_header root_table_header = +@@ -585,6 +591,16 @@ static ctl_table kern_table[] = { + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, ++#ifdef CONFIG_VE ++ { ++ .ctl_name = KERN_VIRT_PIDS, ++ .procname = "virt_pids", ++ .data = &glob_virt_pids, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = 
&proc_dointvec, ++ }, ++#endif + { + .ctl_name = KERN_PANIC_ON_OOPS, + .procname = "panic_on_oops", +@@ -1046,6 +1062,7 @@ int do_sysctl(int __user *name, int nlen + { + struct list_head *tmp; + int error = -ENOTDIR; ++ struct ve_struct *ve; + + if (nlen <= 0 || nlen >= CTL_MAXNAME) + return -ENOTDIR; +@@ -1054,13 +1071,24 @@ int do_sysctl(int __user *name, int nlen + if (!oldlenp || get_user(old_len, oldlenp)) + return -EFAULT; + } ++ ve = get_exec_env(); + spin_lock(&sysctl_lock); ++#ifdef CONFIG_VE ++ tmp = ve->sysctl_lh.next; ++#else + tmp = &root_table_header.ctl_entry; ++#endif + do { +- struct ctl_table_header *head = +- list_entry(tmp, struct ctl_table_header, ctl_entry); ++ struct ctl_table_header *head; + void *context = NULL; + ++#ifdef CONFIG_VE ++ if (tmp == &ve->sysctl_lh) ++ /* second pass over global variables */ ++ tmp = &root_table_header.ctl_entry; ++#endif ++ ++ head = list_entry(tmp, struct ctl_table_header, ctl_entry); + if (!use_table(head)) + continue; + +@@ -1114,10 +1142,14 @@ static int test_perm(int mode, int op) + static inline int ctl_perm(ctl_table *table, int op) + { + int error; ++ int mode = table->mode; ++ + error = security_sysctl(table, op); + if (error) + return error; +- return test_perm(table->mode, op); ++ if (!ve_accessible(table->owner_env, get_exec_env())) ++ mode &= ~0222; /* disable write access */ ++ return test_perm(mode, op); + } + + static int parse_table(int __user *name, int nlen, +@@ -1283,6 +1315,8 @@ struct ctl_table_header *register_sysctl + int insert_at_head) + { + struct ctl_table_header *tmp; ++ struct list_head *lh; ++ + tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); + if (!tmp) + return NULL; +@@ -1291,17 +1325,52 @@ struct ctl_table_header *register_sysctl + tmp->used = 0; + tmp->unregistering = NULL; + spin_lock(&sysctl_lock); ++#ifdef CONFIG_VE ++ lh = &get_exec_env()->sysctl_lh; ++#else ++ lh = &root_table_header.ctl_entry; ++#endif + if (insert_at_head) +- list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); ++ list_add(&tmp->ctl_entry, lh); + else +- list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); ++ list_add_tail(&tmp->ctl_entry, lh); + spin_unlock(&sysctl_lock); + #ifdef CONFIG_PROC_FS ++#ifdef CONFIG_VE ++ register_proc_table(table, get_exec_env()->proc_sys_root, tmp); ++#else + register_proc_table(table, proc_sys_root, tmp); + #endif ++#endif + return tmp; + } + ++void free_sysctl_clone(ctl_table *clone) ++{ ++ kfree(clone); ++} ++ ++ctl_table *clone_sysctl_template(ctl_table *tmpl, int nr) ++{ ++ int i; ++ ctl_table *clone; ++ ++ clone = kmalloc(nr * sizeof(ctl_table), GFP_KERNEL); ++ if (clone == NULL) ++ return NULL; ++ ++ memcpy(clone, tmpl, nr * sizeof(ctl_table)); ++ for (i = 0; i < nr; i++) { ++ if (tmpl[i].ctl_name == 0) ++ continue; ++ clone[i].owner_env = get_exec_env(); ++ if (tmpl[i].child == NULL) ++ continue; ++ clone[i].child = clone + (tmpl[i].child - tmpl); ++ } ++ return clone; ++} ++ + /** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table +@@ -1315,8 +1384,12 @@ void unregister_sysctl_table(struct ctl_ + spin_lock(&sysctl_lock); + start_unregistering(header); + #ifdef CONFIG_PROC_FS ++#ifdef CONFIG_VE ++ unregister_proc_table(header->ctl_table, get_exec_env()->proc_sys_root); ++#else + unregister_proc_table(header->ctl_table, proc_sys_root); + #endif ++#endif + spin_unlock(&sysctl_lock); + kfree(header); + } +@@ -1402,11 +1475,6 @@ static void unregister_proc_table(ctl_ta + * its fields. 
We are under sysctl_lock here. + */ + de->data = NULL; +- +- /* Don't unregister proc entries that are still being used.. */ +- if (atomic_read(&de->count)) +- continue; +- + table->de = NULL; + remove_proc_entry(table->procname, root); + } +@@ -1548,7 +1616,7 @@ int proc_dostring(ctl_table *table, int + * to observe. Should this be in kernel/sys.c ???? + */ + +-static int proc_doutsstring(ctl_table *table, int write, struct file *filp, ++int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) + { + int r; +@@ -2123,7 +2191,7 @@ int proc_dostring(ctl_table *table, int + return -ENOSYS; + } + +-static int proc_doutsstring(ctl_table *table, int write, struct file *filp, ++int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) + { + return -ENOSYS; +@@ -2182,7 +2250,6 @@ int proc_doulongvec_ms_jiffies_minmax(ct + + #endif /* CONFIG_PROC_FS */ + +- + /* + * General sysctl support routines + */ +@@ -2427,6 +2494,14 @@ void unregister_sysctl_table(struct ctl_ + { + } + ++ctl_table * clone_sysctl_template(ctl_table *tmpl, int nr) ++{ ++ return NULL; ++} ++ ++void free_sysctl_clone(ctl_table *tmpl) ++{ ++} + #endif /* CONFIG_SYSCTL */ + + /* +@@ -2439,6 +2514,7 @@ EXPORT_SYMBOL(proc_dointvec_minmax); + EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); + EXPORT_SYMBOL(proc_dointvec_ms_jiffies); + EXPORT_SYMBOL(proc_dostring); ++EXPORT_SYMBOL(proc_doutsstring); + EXPORT_SYMBOL(proc_doulongvec_minmax); + EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); + EXPORT_SYMBOL(register_sysctl_table); +@@ -2447,3 +2523,5 @@ EXPORT_SYMBOL(sysctl_jiffies); + EXPORT_SYMBOL(sysctl_ms_jiffies); + EXPORT_SYMBOL(sysctl_string); + EXPORT_SYMBOL(unregister_sysctl_table); ++EXPORT_SYMBOL(clone_sysctl_template); ++EXPORT_SYMBOL(free_sysctl_clone); +diff -uprN linux-2.6.15.orig/kernel/timer.c linux-2.6.15-ve025stab014/kernel/timer.c +--- linux-2.6.15.orig/kernel/timer.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/timer.c 2006-01-27 14:48:08.000000000 +0300 +@@ -459,7 +459,11 @@ static inline void __run_timers(tvec_bas + spin_unlock_irq(&base->t_base.lock); + { + int preempt_count = preempt_count(); ++ struct ve_struct *ve; ++ ++ ve = set_exec_env(get_ve0()); + fn(data); ++ (void)set_exec_env(ve); + if (preempt_count != preempt_count()) { + printk(KERN_WARNING "huh, entered %p " + "with preempt_count %08x, exited" +@@ -822,6 +826,23 @@ EXPORT_SYMBOL(avenrun); + * calc_load - given tick count, update the avenrun load estimates. + * This is called while holding a write_lock on xtime_lock. 
+ */ ++ ++static void calc_load_ve(void) ++{ ++ unsigned long flags, nr_unint; ++ ++ nr_unint = nr_uninterruptible() * FIXED_1; ++ spin_lock_irqsave(&kstat_glb_lock, flags); ++ CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint); ++ CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint); ++ CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint); ++ spin_unlock_irqrestore(&kstat_glb_lock, flags); ++ ++#ifdef CONFIG_VE ++ do_update_load_avg_ve(); ++#endif ++} ++ + static inline void calc_load(unsigned long ticks) + { + unsigned long active_tasks; /* fixed-point */ +@@ -834,6 +855,7 @@ static inline void calc_load(unsigned lo + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); ++ calc_load_ve(); + } + } + +@@ -941,7 +963,7 @@ asmlinkage unsigned long sys_alarm(unsig + */ + asmlinkage long sys_getpid(void) + { +- return current->tgid; ++ return virt_tgid(current); + } + + /* +@@ -963,28 +985,15 @@ asmlinkage long sys_getpid(void) + asmlinkage long sys_getppid(void) + { + int pid; +- struct task_struct *me = current; +- struct task_struct *parent; +- +- parent = me->group_leader->real_parent; +- for (;;) { +- pid = parent->tgid; +-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) +-{ +- struct task_struct *old = parent; + +- /* +- * Make sure we read the pid before re-reading the +- * parent pointer: +- */ +- smp_rmb(); +- parent = me->group_leader->real_parent; +- if (old != parent) +- continue; +-} +-#endif +- break; +- } ++ /* Some smart code used to be here. It was wrong. ++ * ->real_parent could be released before dereference and ++ * we accessed freed kernel memory, which faults with debugging on. ++ * Keep it simple and stupid. ++ */ ++ read_lock(&tasklist_lock); ++ pid = virt_tgid(current->group_leader->real_parent); ++ read_unlock(&tasklist_lock); + return pid; + } + +@@ -1115,7 +1124,7 @@ EXPORT_SYMBOL(schedule_timeout_uninterru + /* Thread ID - the internal kernel "pid" */ + asmlinkage long sys_gettid(void) + { +- return current->pid; ++ return virt_pid(current); + } + + static long __sched nanosleep_restart(struct restart_block *restart) +@@ -1183,11 +1192,12 @@ asmlinkage long sys_sysinfo(struct sysin + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + unsigned long seq; ++ unsigned long *__avenrun; ++ struct timespec tp; + + memset((char *)&val, 0, sizeof(struct sysinfo)); + + do { +- struct timespec tp; + seq = read_seqbegin(&xtime_lock); + + /* +@@ -1204,14 +1214,25 @@ asmlinkage long sys_sysinfo(struct sysin + tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; + tp.tv_sec++; + } +- val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); +- +- val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); +- val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); +- val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); ++ } while (read_seqretry(&xtime_lock, seq)); + ++ if (ve_is_super(get_exec_env())) { ++ val.uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); ++ __avenrun = &avenrun[0]; + val.procs = nr_threads; +- } while (read_seqretry(&xtime_lock, seq)); ++ } ++#ifdef CONFIG_VE ++ else { ++ struct ve_struct *ve; ++ ve = get_exec_env(); ++ __avenrun = &ve->avenrun[0]; ++ val.procs = atomic_read(&ve->pcounter); ++ val.uptime = tp.tv_sec - ve->start_timespec.tv_sec; ++ } ++#endif ++ val.loads[0] = __avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); ++ val.loads[1] = __avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); ++ val.loads[2] = __avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + + si_meminfo(&val); + si_swapinfo(&val); +diff -uprN linux-2.6.15.orig/kernel/ub/Kconfig linux-2.6.15-ve025stab014/kernel/ub/Kconfig +--- linux-2.6.15.orig/kernel/ub/Kconfig 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/Kconfig 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,89 @@ ++# ++# User resources part (UBC) ++# ++# Copyright (C) 2005 SWsoft ++# All rights reserved. ++# ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++menu "User resources" ++ ++config USER_RESOURCE ++ bool "Enable user resource accounting" ++ default y ++ help ++ This patch provides accounting and allows to configure ++ limits for user's consumption of exhaustible system resources. ++ The most important resource controlled by this patch is unswappable ++ memory (either mlock'ed or used by internal kernel structures and ++ buffers). The main goal of this patch is to protect processes ++ from running short of important resources because of an accidental ++ misbehavior of processes or malicious activity aiming to ``kill'' ++ the system. It's worth to mention that resource limits configured ++ by setrlimit(2) do not give an acceptable level of protection ++ because they cover only small fraction of resources and work on a ++ per-process basis. Per-process accounting doesn't prevent malicious ++ users from spawning a lot of resource-consuming processes. ++ ++config USER_RSS_ACCOUNTING ++ bool "Account physical memory usage" ++ default y ++ depends on USER_RESOURCE ++ help ++ This allows to estimate per beancounter physical memory usage. ++ Implemented alghorithm accounts shared pages of memory as well, ++ dividing them by number of beancounter which use the page. ++ ++config USER_SWAP_ACCOUNTING ++ bool "Account swap usage" ++ default y ++ depends on USER_RESOURCE ++ help ++ This allows accounting of swap usage. ++ ++config USER_RESOURCE_PROC ++ bool "Report resource usage in /proc" ++ default y ++ depends on USER_RESOURCE ++ help ++ Allows a system administrator to inspect resource accounts and limits. ++ ++config UBC_DEBUG ++ bool "User resources debug features" ++ default n ++ depends on USER_RESOURCE ++ help ++ Enables to setup debug features for user resource accounting ++ ++config UBC_DEBUG_KMEM ++ bool "Debug kmemsize with cache counters" ++ default n ++ depends on UBC_DEBUG ++ help ++ Adds /proc/user_beancounters_debug entry to get statistics ++ about cache usage of each beancounter ++ ++config UBC_KEEP_UNUSED ++ bool "Keep unused beancounter alive" ++ default y ++ depends on UBC_DEBUG ++ help ++ If on, unused beancounters are kept on the hash and maxheld value ++ can be looked through. ++ ++config UBC_DEBUG_ITEMS ++ bool "Account resources in items rather than in bytes" ++ default y ++ depends on UBC_DEBUG ++ help ++ When true some of the resources (e.g. kmemsize) are accounted ++ in items instead of bytes. 
++ ++config UBC_UNLIMITED ++ bool "Use unlimited ubc settings" ++ default y ++ depends on UBC_DEBUG ++ help ++ When ON all limits and barriers are set to max values. ++ ++endmenu +diff -uprN linux-2.6.15.orig/kernel/ub/Makefile linux-2.6.15-ve025stab014/kernel/ub/Makefile +--- linux-2.6.15.orig/kernel/ub/Makefile 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/Makefile 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,20 @@ ++# ++# User resources part (UBC) ++# ++# Copyright (C) 2005 SWsoft ++# All rights reserved. ++# ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++obj-y := ub_sys.o ++obj-$(CONFIG_USER_RESOURCE) += beancounter.o ++obj-$(CONFIG_USER_RESOURCE) += ub_dcache.o ++obj-$(CONFIG_USER_RESOURCE) += ub_mem.o ++obj-$(CONFIG_USER_RESOURCE) += ub_misc.o ++obj-$(CONFIG_USER_RESOURCE) += ub_net.o ++obj-$(CONFIG_USER_RESOURCE) += ub_pages.o ++obj-$(CONFIG_USER_RESOURCE) += ub_stat.o ++obj-$(CONFIG_USER_RESOURCE) += ub_oom.o ++ ++obj-$(CONFIG_USER_RSS_ACCOUNTING) += ub_page_bc.o ++obj-$(CONFIG_USER_RESOURCE_PROC) += ub_proc.o +diff -uprN linux-2.6.15.orig/kernel/ub/beancounter.c linux-2.6.15-ve025stab014/kernel/ub/beancounter.c +--- linux-2.6.15.orig/kernel/ub/beancounter.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/beancounter.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,675 @@ ++/* ++ * linux/kernel/ub/beancounter.c ++ * ++ * Copyright (C) 1998 Alan Cox ++ * 1998-2000 Andrey V. Savochkin <saw@saw.sw.com.sg> ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * TODO: ++ * - more intelligent limit check in mremap(): currently the new size is ++ * charged and _then_ old size is uncharged ++ * (almost done: !move_vma case is completely done, ++ * move_vma in its current implementation requires too many conditions to ++ * do things right, because it may be not only expansion, but shrinking ++ * also, plus do_munmap will require an additional parameter...) ++ * - problem: bad pmd page handling ++ * - consider /proc redesign ++ * - TCP/UDP ports ++ * + consider whether __charge_beancounter_locked should be inline ++ * ++ * Changes: ++ * 1999/08/17 Marcelo Tosatti <marcelo@conectiva.com.br> ++ * - Set "barrier" and "limit" parts of limits atomically. ++ * 1999/10/06 Marcelo Tosatti <marcelo@conectiva.com.br> ++ * - setublimit system call. 
++ */ ++ ++#include <linux/slab.h> ++#include <linux/module.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_hash.h> ++#include <ub/ub_vmpages.h> ++ ++static kmem_cache_t *ub_cachep; ++static struct user_beancounter default_beancounter; ++struct user_beancounter ub0; ++ ++const char *ub_rnames[] = { ++ "kmemsize", /* 0 */ ++ "lockedpages", ++ "privvmpages", ++ "shmpages", ++ "dummy", ++ "numproc", /* 5 */ ++ "physpages", ++ "vmguarpages", ++ "oomguarpages", ++ "numtcpsock", ++ "numflock", /* 10 */ ++ "numpty", ++ "numsiginfo", ++ "tcpsndbuf", ++ "tcprcvbuf", ++ "othersockbuf", /* 15 */ ++ "dgramrcvbuf", ++ "numothersock", ++ "dcachesize", ++ "numfile", ++ "dummy", /* 20 */ ++ "dummy", ++ "dummy", ++ "numiptent", ++ "unused_privvmpages", /* UB_RESOURCES */ ++ "tmpfs_respages", ++ "swap_pages", ++ "held_pages", ++}; ++ ++static void init_beancounter_struct(struct user_beancounter *ub); ++static void init_beancounter_store(struct user_beancounter *ub); ++static void init_beancounter_nolimits(struct user_beancounter *ub); ++ ++void print_ub_uid(struct user_beancounter *ub, char *buf, int size) ++{ ++ if (ub->parent != NULL) ++ snprintf(buf, size, "%u.%u", ub->parent->ub_uid, ub->ub_uid); ++ else ++ snprintf(buf, size, "%u", ub->ub_uid); ++} ++EXPORT_SYMBOL(print_ub_uid); ++ ++#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1)) ++#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17) ++struct ub_hash_slot ub_hash[UB_HASH_SIZE]; ++spinlock_t ub_hash_lock; ++EXPORT_SYMBOL(ub_hash); ++EXPORT_SYMBOL(ub_hash_lock); ++ ++/* ++ * Per user resource beancounting. Resources are tied to their luid. ++ * The resource structure itself is tagged both to the process and ++ * the charging resources (a socket doesn't want to have to search for ++ * things at irq time for example). Reference counters keep things in ++ * hand. ++ * ++ * The case where a user creates resource, kills all his processes and ++ * then starts new ones is correctly handled this way. The refcounters ++ * will mean the old entry is still around with resource tied to it. 
++ */ ++struct user_beancounter *get_beancounter_byuid(uid_t uid, int create) ++{ ++ struct user_beancounter *new_ub, *ub; ++ unsigned long flags; ++ struct ub_hash_slot *slot; ++ ++ slot = &ub_hash[ub_hash_fun(uid)]; ++ new_ub = NULL; ++ ++retry: ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ ub = slot->ubh_beans; ++ while (ub != NULL && (ub->ub_uid != uid || ub->parent != NULL)) ++ ub = ub->ub_next; ++ ++ if (ub != NULL) { ++ /* found */ ++ get_beancounter(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ if (new_ub != NULL) ++ kmem_cache_free(ub_cachep, new_ub); ++ return ub; ++ } ++ ++ if (!create) { ++ /* no ub found */ ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return NULL; ++ } ++ ++ if (new_ub != NULL) { ++ /* install new ub */ ++ new_ub->ub_next = slot->ubh_beans; ++ slot->ubh_beans = new_ub; ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return new_ub; ++ } ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ /* alloc new ub */ ++ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, ++ GFP_KERNEL); ++ if (new_ub == NULL) ++ return NULL; ++ ++ ub_debug(UBD_ALLOC, "Creating ub %p in slot %p\n", new_ub, slot); ++ memcpy(new_ub, &default_beancounter, sizeof(*new_ub)); ++ init_beancounter_struct(new_ub); ++ new_ub->ub_uid = uid; ++ goto retry; ++} ++EXPORT_SYMBOL(get_beancounter_byuid); ++ ++struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p, ++ int id, int create) ++{ ++ struct user_beancounter *new_ub, *ub; ++ unsigned long flags; ++ struct ub_hash_slot *slot; ++ ++ slot = &ub_hash[ub_subhash_fun(p, id)]; ++ new_ub = NULL; ++ ++retry: ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ ub = slot->ubh_beans; ++ while (ub != NULL && (ub->parent != p || ub->ub_uid != id)) ++ ub = ub->ub_next; ++ ++ if (ub != NULL) { ++ /* found */ ++ get_beancounter(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ if (new_ub != NULL) { ++ put_beancounter(new_ub->parent); ++ kmem_cache_free(ub_cachep, new_ub); ++ } ++ return ub; ++ } ++ ++ if (!create) { ++ /* no ub found */ ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return NULL; ++ } ++ ++ if (new_ub != NULL) { ++ /* install new ub */ ++ get_beancounter(new_ub); ++ new_ub->ub_next = slot->ubh_beans; ++ slot->ubh_beans = new_ub; ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return new_ub; ++ } ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ /* alloc new ub */ ++ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, ++ GFP_KERNEL); ++ if (new_ub == NULL) ++ return NULL; ++ ++ ub_debug(UBD_ALLOC, "Creating sub %p in slot %p\n", new_ub, slot); ++ memset(new_ub, 0, sizeof(*new_ub)); ++ init_beancounter_nolimits(new_ub); ++ init_beancounter_store(new_ub); ++ init_beancounter_struct(new_ub); ++ atomic_set(&new_ub->ub_refcount, 0); ++ new_ub->ub_uid = id; ++ new_ub->parent = get_beancounter(p); ++ goto retry; ++} ++EXPORT_SYMBOL(get_subbeancounter_byid); ++ ++struct user_beancounter *subbeancounter_findcreate(struct user_beancounter *p, ++ int id) ++{ ++ struct user_beancounter *ub; ++ unsigned long flags; ++ struct ub_hash_slot *slot; ++ ++ slot = &ub_hash[ub_subhash_fun(p, id)]; ++ ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ ub = slot->ubh_beans; ++ while (ub != NULL && (ub->parent != p || ub->ub_uid != id)) ++ ub = ub->ub_next; ++ ++ if (ub != NULL) { ++ /* found */ ++ get_beancounter(ub); ++ goto done; ++ } ++ ++ /* alloc new ub */ ++ /* Can be called from non-atomic contexts. 
Den */ ++ ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, GFP_ATOMIC); ++ if (ub == NULL) ++ goto done; ++ ++ ub_debug(UBD_ALLOC, "Creating sub %p in slot %p\n", ub, slot); ++ memset(ub, 0, sizeof(*ub)); ++ init_beancounter_nolimits(ub); ++ init_beancounter_store(ub); ++ init_beancounter_struct(ub); ++ atomic_set(&ub->ub_refcount, 0); ++ ub->ub_uid = id; ++ ub->parent = get_beancounter(p); ++ ++ /* install new ub */ ++ get_beancounter(ub); ++ ub->ub_next = slot->ubh_beans; ++ slot->ubh_beans = ub; ++ ++done: ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return ub; ++} ++EXPORT_SYMBOL(subbeancounter_findcreate); ++#ifndef CONFIG_UBC_KEEP_UNUSED ++ ++static int verify_res(struct user_beancounter *ub, int resource, ++ unsigned long held) ++{ ++ char id[64]; ++ ++ if (likely(held == 0)) ++ return 1; ++ ++ print_ub_uid(ub, id, sizeof(id)); ++ printk(KERN_WARNING "Ub %s helds %lu in %s on put\n", ++ id, held, ub_rnames[resource]); ++ return 0; ++} ++ ++static inline void verify_held(struct user_beancounter *ub) ++{ ++ int i, clean; ++ ++ clean = 1; ++ for (i = 0; i < UB_RESOURCES; i++) ++ clean &= verify_res(ub, i, ub->ub_parms[i].held); ++ ++ clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages); ++ clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages); ++ clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages); ++ clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages); ++ ++ ub_debug_trace(!clean, 5, 60*HZ); ++} ++ ++static void __unhash_beancounter(struct user_beancounter *ub) ++{ ++ struct user_beancounter **ubptr; ++ struct ub_hash_slot *slot; ++ ++ if (ub->parent != NULL) ++ slot = &ub_hash[ub_subhash_fun(ub->parent, ub->ub_uid)]; ++ else ++ slot = &ub_hash[ub_hash_fun(ub->ub_uid)]; ++ ubptr = &slot->ubh_beans; ++ ++ while (*ubptr != NULL) { ++ if (*ubptr == ub) { ++ verify_held(ub); ++ *ubptr = ub->ub_next; ++ return; ++ } ++ ubptr = &((*ubptr)->ub_next); ++ } ++ printk(KERN_ERR "Invalid beancounter %p, luid=%d on free, slot %p\n", ++ ub, ub->ub_uid, slot); ++} ++#endif ++ ++void __put_beancounter(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ struct user_beancounter *parent; ++ ++again: ++ parent = ub->parent; ++ ub_debug(UBD_ALLOC, "__put bc %p (cnt %d) for %.20s pid %d " ++ "cur %08lx cpu %d.\n", ++ ub, atomic_read(&ub->ub_refcount), ++ current->comm, current->pid, ++ (unsigned long)current, smp_processor_id()); ++ ++ /* equevalent to atomic_dec_and_lock_irqsave() */ ++ local_irq_save(flags); ++ if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) { ++ if (unlikely(atomic_read(&ub->ub_refcount) < 0)) ++ printk(KERN_ERR "UB: Bad ub refcount: ub=%p, " ++ "luid=%d, ref=%d\n", ++ ub, ub->ub_uid, ++ atomic_read(&ub->ub_refcount)); ++ local_irq_restore(flags); ++ return; ++ } ++ ++ if (unlikely(ub == get_ub0())) { ++ printk(KERN_ERR "Trying to put ub0\n"); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return; ++ } ++ ++#ifndef CONFIG_UBC_KEEP_UNUSED ++ __unhash_beancounter(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ub_free_counters(ub); ++ kmem_cache_free(ub_cachep, ub); ++#else ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++#endif ++ ub = parent; ++ if (ub != NULL) ++ goto again; ++} ++EXPORT_SYMBOL(__put_beancounter); ++ ++/* ++ * Generic resource charging stuff ++ */ ++ ++int __charge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val, enum severity strict) ++{ ++ ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n", ++ val, resource, 
ub, ub->ub_parms[resource].held); ++ /* ++ * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition ++ * at the moment is possible so an overflow is impossible. ++ */ ++ ub->ub_parms[resource].held += val; ++ ++ switch (strict) { ++ case UB_HARD: ++ if (ub->ub_parms[resource].held > ++ ub->ub_parms[resource].barrier) ++ break; ++ case UB_SOFT: ++ if (ub->ub_parms[resource].held > ++ ub->ub_parms[resource].limit) ++ break; ++ case UB_FORCE: ++ ub_adjust_maxheld(ub, resource); ++ return 0; ++ default: ++ BUG(); ++ } ++ ++ if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl)) ++ printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n", ++ ub_rnames[resource], ub->ub_uid); ++ ub->ub_parms[resource].failcnt++; ++ ub->ub_parms[resource].held -= val; ++ return -ENOMEM; ++} ++ ++int charge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val, enum severity strict) ++{ ++ int retval; ++ struct user_beancounter *p, *q; ++ unsigned long flags; ++ ++ retval = -EINVAL; ++ if (val > UB_MAXVALUE) ++ goto out; ++ ++ local_irq_save(flags); ++ for (p = ub; p != NULL; p = p->parent) { ++ spin_lock(&p->ub_lock); ++ retval = __charge_beancounter_locked(p, resource, val, strict); ++ spin_unlock(&p->ub_lock); ++ if (retval) ++ goto unroll; ++ } ++out_restore: ++ local_irq_restore(flags); ++out: ++ return retval; ++ ++unroll: ++ for (q = ub; q != p; q = q->parent) { ++ spin_lock(&q->ub_lock); ++ __uncharge_beancounter_locked(q, resource, val); ++ spin_unlock(&q->ub_lock); ++ } ++ goto out_restore; ++} ++ ++EXPORT_SYMBOL(charge_beancounter); ++ ++void charge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ struct user_beancounter *p; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ for (p = ub; p->parent != NULL; p = p->parent) { ++ spin_lock(&p->ub_lock); ++ __charge_beancounter_locked(p, resource, val, UB_FORCE); ++ spin_unlock(&p->ub_lock); ++ } ++ local_irq_restore(flags); ++} ++ ++EXPORT_SYMBOL(charge_beancounter_notop); ++ ++void uncharge_warn(struct user_beancounter *ub, int resource, ++ unsigned long val, unsigned long held) ++{ ++ char id[64]; ++ ++ print_ub_uid(ub, id, sizeof(id)); ++ printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n", ++ val, held, ub_rnames[resource], id); ++ ub_debug_trace(1, 10, 10*HZ); ++} ++ ++void __uncharge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n", ++ val, resource, ub, ub->ub_parms[resource].held); ++ if (ub->ub_parms[resource].held < val) { ++ uncharge_warn(ub, resource, ++ val, ub->ub_parms[resource].held); ++ val = ub->ub_parms[resource].held; ++ } ++ ub->ub_parms[resource].held -= val; ++} ++ ++void uncharge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ unsigned long flags; ++ struct user_beancounter *p; ++ ++ for (p = ub; p != NULL; p = p->parent) { ++ spin_lock_irqsave(&p->ub_lock, flags); ++ __uncharge_beancounter_locked(p, resource, val); ++ spin_unlock_irqrestore(&p->ub_lock, flags); ++ } ++} ++ ++EXPORT_SYMBOL(uncharge_beancounter); ++ ++void uncharge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ struct user_beancounter *p; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ for (p = ub; p->parent != NULL; p = p->parent) { ++ spin_lock(&p->ub_lock); ++ __uncharge_beancounter_locked(p, resource, val); ++ spin_unlock(&p->ub_lock); ++ } ++ local_irq_restore(flags); ++} ++ 
++EXPORT_SYMBOL(uncharge_beancounter_notop); ++ ++ ++/* ++ * Rate limiting stuff. ++ */ ++int ub_ratelimit(struct ub_rate_info *p) ++{ ++ unsigned long cjif, djif; ++ unsigned long flags; ++ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; ++ long new_bucket; ++ ++ spin_lock_irqsave(&ratelimit_lock, flags); ++ cjif = jiffies; ++ djif = cjif - p->last; ++ if (djif < p->interval) { ++ if (p->bucket >= p->burst) { ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 0; ++ } ++ p->bucket++; ++ } else { ++ new_bucket = p->bucket - (djif / (unsigned)p->interval); ++ if (new_bucket < 0) ++ new_bucket = 0; ++ p->bucket = new_bucket + 1; ++ } ++ p->last = cjif; ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 1; ++} ++EXPORT_SYMBOL(ub_ratelimit); ++ ++ ++/* ++ * Initialization ++ * ++ * struct user_beancounter contains ++ * - limits and other configuration settings, ++ * with a copy stored for accounting purposes, ++ * - structural fields: lists, spinlocks and so on. ++ * ++ * Before these parts are initialized, the structure should be memset ++ * to 0 or copied from a known clean structure. That takes care of a lot ++ * of fields not initialized explicitly. ++ */ ++ ++static void init_beancounter_struct(struct user_beancounter *ub) ++{ ++ ub->ub_magic = UB_MAGIC; ++ atomic_set(&ub->ub_refcount, 1); ++ spin_lock_init(&ub->ub_lock); ++ INIT_LIST_HEAD(&ub->ub_tcp_sk_list); ++ INIT_LIST_HEAD(&ub->ub_other_sk_list); ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ INIT_LIST_HEAD(&ub->ub_cclist); ++#endif ++} ++ ++static void init_beancounter_store(struct user_beancounter *ub) ++{ ++ int k; ++ ++ for (k = 0; k < UB_RESOURCES; k++) { ++ memcpy(&ub->ub_store[k], &ub->ub_parms[k], ++ sizeof(struct ubparm)); ++ } ++} ++ ++static void init_beancounter_nolimits(struct user_beancounter *ub) ++{ ++ int k; ++ ++ for (k = 0; k < UB_RESOURCES; k++) { ++ ub->ub_parms[k].limit = UB_MAXVALUE; ++ /* FIXME: whether this is right for physpages and guarantees? */ ++ ub->ub_parms[k].barrier = UB_MAXVALUE; ++ } ++ ++ /* FIXME: set unlimited rate? */ ++ ub->ub_limit_rl.burst = 4; ++ ub->ub_limit_rl.interval = 300*HZ; ++} ++ ++static void init_beancounter_syslimits(struct user_beancounter *ub, ++ unsigned long mp) ++{ ++ extern int max_threads; ++ int k; ++ ++ ub->ub_parms[UB_KMEMSIZE].limit = ++ mp > (192*1024*1024 >> PAGE_SHIFT) ? 
++ 32*1024*1024 : (mp << PAGE_SHIFT) / 6; ++ ub->ub_parms[UB_LOCKEDPAGES].limit = 8; ++ ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE; ++ ub->ub_parms[UB_SHMPAGES].limit = 64; ++ ub->ub_parms[UB_NUMPROC].limit = max_threads / 2; ++ ub->ub_parms[UB_NUMTCPSOCK].limit = 1024; ++ ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */ ++ ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */ ++ ub->ub_parms[UB_NUMOTHERSOCK].limit = 256; ++ ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */ ++ ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */ ++ ub->ub_parms[UB_NUMFLOCK].limit = 1024; ++ ub->ub_parms[UB_NUMPTY].limit = 16; ++ ub->ub_parms[UB_NUMSIGINFO].limit = 1024; ++ ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024; ++ ub->ub_parms[UB_NUMFILE].limit = 1024; ++ ++ for (k = 0; k < UB_RESOURCES; k++) ++ ub->ub_parms[k].barrier = ub->ub_parms[k].limit; ++ ++ ub->ub_limit_rl.burst = 4; ++ ub->ub_limit_rl.interval = 300*HZ; ++} ++ ++void __init ub_init_ub0(void) ++{ ++ struct user_beancounter *ub; ++ ++ init_cache_counters(); ++ ub = get_ub0(); ++ memset(ub, 0, sizeof(*ub)); ++ ub->ub_uid = 0; ++ init_beancounter_nolimits(ub); ++ init_beancounter_store(ub); ++ init_beancounter_struct(ub); ++ ++ memset(¤t->task_bc, 0, sizeof(struct task_beancounter)); ++ (void)set_exec_ub(get_ub0()); ++ current->task_bc.fork_sub = get_beancounter(get_ub0()); ++ init_mm.mm_ub = get_beancounter(ub); ++} ++ ++void __init ub_hash_init(void) ++{ ++ struct ub_hash_slot *slot; ++ ++ spin_lock_init(&ub_hash_lock); ++ /* insert ub0 into the hash */ ++ slot = &ub_hash[ub_hash_fun(get_ub0()->ub_uid)]; ++ slot->ubh_beans = get_ub0(); ++} ++ ++void __init ub_init_cache(unsigned long mempages) ++{ ++ extern int skbc_cache_init(void); ++ int res; ++ ++ res = skbc_cache_init(); ++ ub_cachep = kmem_cache_create("user_beancounters", ++ sizeof(struct user_beancounter), ++ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); ++ if (res < 0 || ub_cachep == NULL) ++ panic("Can't create ubc caches\n"); ++ ++ memset(&default_beancounter, 0, sizeof(default_beancounter)); ++#ifdef CONFIG_UBC_UNLIMITED ++ init_beancounter_nolimits(&default_beancounter); ++#else ++ init_beancounter_syslimits(&default_beancounter, mempages); ++#endif ++ init_beancounter_store(&default_beancounter); ++ init_beancounter_struct(&default_beancounter); ++ ++ ub_hash_init(); ++} +diff -uprN linux-2.6.15.orig/kernel/ub/ub_dcache.c linux-2.6.15-ve025stab014/kernel/ub/ub_dcache.c +--- linux-2.6.15.orig/kernel/ub/ub_dcache.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_dcache.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,325 @@ ++/* ++ * kernel/ub/ub_dcache.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/dcache.h> ++#include <linux/slab.h> ++#include <linux/kmem_cache.h> ++#include <linux/fs.h> ++#include <linux/err.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> ++#include <ub/ub_dcache.h> ++ ++/* ++ * Locking ++ * traverse dcache_lock d_lock ++ * ub_dentry_charge + + + ++ * ub_dentry_uncharge + - + ++ * ub_dentry_charge_nofail + + - ++ * ++ * d_inuse is atomic so that we can inc dentry's parent d_inuse in ++ * ub_dentry_charhe with the only dentry's d_lock held. ++ * ++ * Race in uncharge vs charge_nofail is handled with dcache_lock. ++ * Race in charge vs charge_nofail is inessential since they both inc d_inuse. 
++ * Race in uncharge vs charge is handled by altering d_inuse under d_lock. ++ * ++ * Race with d_move is handled this way: ++ * - charge_nofail and uncharge are protected by dcache_lock; ++ * - charge works only with dentry and dentry->d_parent->d_inuse, so ++ * it's enough to lock only the dentry. ++ */ ++ ++/* ++ * Beancounting ++ * UB argument must NOT be NULL ++ */ ++ ++static int do_charge_dcache(struct user_beancounter *ub, unsigned long size, ++ enum severity sv) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv)) ++ goto out_mem; ++ if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv)) ++ goto out_dcache; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return 0; ++ ++out_dcache: ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); ++out_mem: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return -ENOMEM; ++} ++ ++static void do_uncharge_dcache(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); ++ __uncharge_beancounter_locked(ub, UB_DCACHESIZE, size); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++static int charge_dcache(struct user_beancounter *ub, unsigned long size, ++ enum severity sv) ++{ ++ struct user_beancounter *p, *q; ++ ++ for (p = ub; p != NULL; p = p->parent) { ++ if (do_charge_dcache(p, size, sv)) ++ goto unroll; ++ } ++ return 0; ++ ++unroll: ++ for (q = ub; q != p; q = q->parent) ++ do_uncharge_dcache(q, size); ++ return -ENOMEM; ++} ++ ++void uncharge_dcache(struct user_beancounter *ub, unsigned long size) ++{ ++ for (; ub != NULL; ub = ub->parent) ++ do_uncharge_dcache(ub, size); ++} ++ ++static inline void charge_dcache_forced(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ charge_dcache(ub, size, UB_FORCE); ++} ++ ++static inline void d_forced_charge(struct dentry_beancounter *d_bc) ++{ ++ d_bc->d_ub = get_beancounter(get_exec_ub()); ++ if (d_bc->d_ub == NULL) ++ return; ++ ++ charge_dcache_forced(d_bc->d_ub, d_bc->d_ubsize); ++} ++ ++static inline void d_uncharge(struct dentry_beancounter *d_bc) ++{ ++ if (d_bc->d_ub == NULL) ++ return; ++ ++ uncharge_dcache(d_bc->d_ub, d_bc->d_ubsize); ++ put_beancounter(d_bc->d_ub); ++ d_bc->d_ub = NULL; ++} ++ ++/* ++ * Alloc / free dentry_beancounter ++ */ ++ ++static inline int d_alloc_beancounter(struct dentry *d) ++{ ++ return 0; ++} ++ ++static inline void d_free_beancounter(struct dentry_beancounter *d_bc) ++{ ++} ++ ++static inline unsigned long d_charge_size(struct dentry *dentry) ++{ ++ /* dentry's d_name is already set to appropriate value (see d_alloc) */ ++ return inode_cachep->objuse + dentry_cache->objuse + ++ (dname_external(dentry) ? ++ kmem_obj_memusage((void *)dentry->d_name.name) : 0); ++} ++ ++/* ++ * dentry mark in use operation ++ * d_lock is held ++ */ ++ ++static int d_inc_inuse(struct dentry *dentry) ++{ ++ struct user_beancounter *ub; ++ struct dentry_beancounter *d_bc; ++ ++ if (dentry != dentry->d_parent) { ++ struct dentry *parent; ++ ++ /* ++ * Increment d_inuse of parent. ++ * It can't change since dentry->d_lock is held. 
++ */ ++ parent = dentry->d_parent; ++ if (ub_dget_testone(parent)) ++ BUG(); ++ } ++ ++ d_bc = &dentry->dentry_bc; ++ ub = get_beancounter(get_exec_ub()); ++ ++ if (ub != NULL && charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) ++ goto out_err; ++ ++ d_bc->d_ub = ub; ++ return 0; ++ ++out_err: ++ put_beancounter(ub); ++ d_bc->d_ub = NULL; ++ return -ENOMEM; ++} ++ ++/* ++ * no locks ++ */ ++int ub_dentry_alloc(struct dentry *dentry) ++{ ++ int err; ++ struct dentry_beancounter *d_bc; ++ ++ err = d_alloc_beancounter(dentry); ++ if (err < 0) ++ return err; ++ ++ d_bc = &dentry->dentry_bc; ++ d_bc->d_ub = get_beancounter(get_exec_ub()); ++ atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in ub_dcache.h */ ++ d_bc->d_ubsize = d_charge_size(dentry); ++ ++ err = 0; ++ if (d_bc->d_ub != NULL && ++ charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD)) { ++ put_beancounter(d_bc->d_ub); ++ d_free_beancounter(d_bc); ++ err = -ENOMEM; ++ } ++ ++ return err; ++} ++ ++/* ++ * Charge / uncharge functions. ++ * ++ * We take d_lock to protect dentry_bc from concurrent acces ++ * when simultaneous __d_lookup and d_put happens on one dentry. ++ */ ++ ++/* ++ * no dcache_lock, d_lock and rcu_read_lock are held ++ * drops d_lock, rcu_read_lock and returns error if any ++ */ ++int ub_dentry_charge(struct dentry *dentry) ++{ ++ int err; ++ ++ err = 0; ++ if (ub_dget_testone(dentry)) ++ err = d_inc_inuse(dentry); ++ ++ /* ++ * d_lock and rcu_read_lock are dropped here ++ * (see also __d_lookup) ++ */ ++ spin_unlock(&dentry->d_lock); ++ rcu_read_unlock(); ++ ++ if (!err) ++ return 0; ++ ++ /* ++ * d_invlaidate is required for real_lookup ++ * since it tries to create new dentry on ++ * d_lookup failure. ++ */ ++ if (!d_invalidate(dentry)) ++ return err; ++ ++ /* didn't succeeded, force dentry to be charged */ ++ d_forced_charge(&dentry->dentry_bc); ++ return 0; ++} ++ ++/* ++ * dcache_lock is held ++ * no d_locks, sequentaly takes and drops from dentry upward ++ */ ++void ub_dentry_uncharge(struct dentry *dentry) ++{ ++ struct dentry *parent; ++ ++ /* go up until status is changed and root is not reached */ ++ while (1) { ++ /* ++ * We need d_lock here to handle ++ * the race with ub_dentry_charge ++ */ ++ spin_lock(&dentry->d_lock); ++ if (!ub_dput_testzero(dentry)) { ++ spin_unlock(&dentry->d_lock); ++ break; ++ } ++ ++ /* state transition 0 => -1 */ ++ d_uncharge(&dentry->dentry_bc); ++ parent = dentry->d_parent; ++ spin_unlock(&dentry->d_lock); ++ ++ /* ++ * dcache_lock is held (see comment in __dget_locked) ++ * so we can safely move upwards. ++ */ ++ if (dentry == parent) ++ break; ++ dentry = parent; ++ } ++} ++ ++/* ++ * forced version. for dget in clean cache, when error is not an option ++ * ++ * dcache_lock is held ++ * no d_locks ++ */ ++void ub_dentry_charge_nofail(struct dentry *dentry) ++{ ++ struct dentry *parent; ++ ++ /* go up until status is changed and root is not reached */ ++ while (1) { ++ if (!ub_dget_testone(dentry)) ++ break; ++ ++ /* ++ * state transition -1 => 0 ++ * ++ * No need to lock dentry before atomic_inc ++ * like we do in ub_dentry_uncharge. ++ * We can't race with ub_dentry_uncharge due ++ * to dcache_lock. The only possible race with ++ * ub_dentry_charge is OK since they both ++ * do atomic_inc. ++ */ ++ d_forced_charge(&dentry->dentry_bc); ++ /* ++ * dcache_lock is held (see comment in __dget_locked) ++ * so we can safely move upwards. 
++ */ ++ parent = dentry->d_parent; ++ ++ if (dentry == parent) ++ break; ++ dentry = parent; ++ } ++} +diff -uprN linux-2.6.15.orig/kernel/ub/ub_mem.c linux-2.6.15-ve025stab014/kernel/ub/ub_mem.c +--- linux-2.6.15.orig/kernel/ub/ub_mem.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_mem.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,388 @@ ++/* ++ * kernel/ub/ub_mem.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/slab.h> ++#include <linux/kmem_cache.h> ++#include <linux/kmem_slab.h> ++#include <linux/highmem.h> ++#include <linux/vmalloc.h> ++#include <linux/mm.h> ++#include <linux/gfp.h> ++#include <linux/swap.h> ++#include <linux/spinlock.h> ++#include <linux/sched.h> ++#include <linux/module.h> ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> ++#include <ub/ub_hash.h> ++ ++/* ++ * Initialization ++ */ ++ ++/* ++ * Slab accounting ++ */ ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ ++#define CC_HASH_SIZE 1024 ++static struct ub_cache_counter *cc_hash[CC_HASH_SIZE]; ++spinlock_t cc_lock; ++ ++static void __free_cache_counters(struct user_beancounter *ub, ++ kmem_cache_t *cachep) ++{ ++ struct ub_cache_counter *cc, **pprev, *del; ++ int i; ++ unsigned long flags; ++ ++ del = NULL; ++ spin_lock_irqsave(&cc_lock, flags); ++ for (i = 0; i < CC_HASH_SIZE; i++) { ++ pprev = &cc_hash[i]; ++ cc = cc_hash[i]; ++ while (cc != NULL) { ++ if (cc->ub != ub && cc->cachep != cachep) { ++ pprev = &cc->next; ++ cc = cc->next; ++ continue; ++ } ++ ++ list_del(&cc->ulist); ++ *pprev = cc->next; ++ cc->next = del; ++ del = cc; ++ cc = *pprev; ++ } ++ } ++ spin_unlock_irqrestore(&cc_lock, flags); ++ ++ while (del != NULL) { ++ cc = del->next; ++ kfree(del); ++ del = cc; ++ } ++} ++ ++void ub_free_counters(struct user_beancounter *ub) ++{ ++ __free_cache_counters(ub, NULL); ++} ++ ++void ub_kmemcache_free(kmem_cache_t *cachep) ++{ ++ __free_cache_counters(NULL, cachep); ++} ++ ++void __init init_cache_counters(void) ++{ ++ memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0])); ++ spin_lock_init(&cc_lock); ++} ++ ++#define cc_hash_fun(ub, cachep) ( \ ++ (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \ ++ ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \ ++ ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \ ++ ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \ ++ ) & (CC_HASH_SIZE - 1)) ++ ++static int change_slab_charged(struct user_beancounter *ub, void *objp, ++ unsigned long val, int mask) ++{ ++ struct ub_cache_counter *cc, *new_cnt, **pprev; ++ kmem_cache_t *cachep; ++ unsigned long flags; ++ ++ cachep = GET_PAGE_CACHE(virt_to_page(objp)); ++ new_cnt = NULL; ++ ++again: ++ spin_lock_irqsave(&cc_lock, flags); ++ cc = cc_hash[cc_hash_fun(ub, cachep)]; ++ while (cc) { ++ if (cc->ub == ub && cc->cachep == cachep) ++ goto found; ++ cc = cc->next; ++ } ++ ++ if (new_cnt != NULL) ++ goto insert; ++ ++ spin_unlock_irqrestore(&cc_lock, flags); ++ ++ new_cnt = kmalloc(sizeof(*new_cnt), mask & ~__GFP_UBC); ++ if (new_cnt == NULL) ++ return -ENOMEM; ++ ++ new_cnt->counter = 0; ++ new_cnt->ub = ub; ++ new_cnt->cachep = cachep; ++ goto again; ++ ++insert: ++ pprev = &cc_hash[cc_hash_fun(ub, cachep)]; ++ new_cnt->next = *pprev; ++ *pprev = new_cnt; ++ list_add(&new_cnt->ulist, &ub->ub_cclist); ++ cc = new_cnt; ++ new_cnt = NULL; ++ ++found: ++ cc->counter += val; ++ spin_unlock_irqrestore(&cc_lock, flags); ++ if (new_cnt) ++ kfree(new_cnt); ++ return 0; ++} ++ ++static inline int 
inc_slab_charged(struct user_beancounter *ub, ++ void *objp, int mask) ++{ ++ return change_slab_charged(ub, objp, 1, mask); ++} ++ ++static inline void dec_slab_charged(struct user_beancounter *ub, void *objp) ++{ ++ if (change_slab_charged(ub, objp, -1, 0) < 0) ++ BUG(); ++} ++ ++#include <linux/vmalloc.h> ++ ++static inline int inc_pages_charged(struct user_beancounter *ub, ++ struct page *pg, int order) ++{ ++ int cpu; ++ ++ cpu = get_cpu(); ++ ub->ub_stat[cpu].pages_charged++; ++ put_cpu(); ++ return 0; ++} ++ ++static inline void dec_pages_charged(struct user_beancounter *ub, ++ struct page *pg, int order) ++{ ++ int cpu; ++ ++ cpu = get_cpu(); ++ ub->ub_stat[cpu].pages_charged--; ++ put_cpu(); ++} ++ ++void inc_vmalloc_charged(struct vm_struct *vm, int flags) ++{ ++ int cpu; ++ struct user_beancounter *ub; ++ ++ if (!(flags & __GFP_UBC)) ++ return; ++ ++ ub = get_exec_ub(); ++ if (ub == NULL) ++ return; ++ ++ cpu = get_cpu(); ++ ub->ub_stat[cpu].vmalloc_charged += vm->nr_pages; ++ put_cpu(); ++} ++ ++void dec_vmalloc_charged(struct vm_struct *vm) ++{ ++ int cpu; ++ struct user_beancounter *ub; ++ ++ ub = page_ub(vm->pages[0]); ++ if (ub == NULL) ++ return; ++ ++ cpu = get_cpu(); ++ ub->ub_stat[cpu].vmalloc_charged -= vm->nr_pages; ++ put_cpu(); ++} ++ ++#else ++#define inc_slab_charged(ub, o, m) (0) ++#define dec_slab_charged(ub, o) do { } while (0) ++#define inc_pages_charged(ub, pg, o) (0) ++#define dec_pages_charged(ub, pg, o) do { } while (0) ++#endif ++ ++static inline struct user_beancounter **slab_ub_ref(void *objp) ++{ ++ struct page *pg; ++ kmem_cache_t *cachep; ++ struct slab *slabp; ++ int objnr; ++ ++ pg = virt_to_page(objp); ++ cachep = GET_PAGE_CACHE(pg); ++ BUG_ON(!(cachep->flags & SLAB_UBC)); ++ slabp = GET_PAGE_SLAB(pg); ++ objnr = (objp - slabp->s_mem) / cachep->objsize; ++ return slab_ubcs(cachep, slabp) + objnr; ++} ++ ++struct user_beancounter *slab_ub(void *objp) ++{ ++ struct user_beancounter **ub_ref; ++ ++ ub_ref = slab_ub_ref(objp); ++ return *ub_ref; ++} ++ ++EXPORT_SYMBOL(slab_ub); ++ ++static inline int should_charge(void *objp, int flags) ++{ ++ kmem_cache_t *cachep; ++ ++ cachep = GET_PAGE_CACHE(virt_to_page(objp)); ++ if (!(cachep->flags & SLAB_UBC)) ++ return 0; ++ if ((cachep->flags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC)) ++ return 0; ++ return 1; ++} ++ ++#define should_uncharge(objp) should_charge(objp, __GFP_UBC) ++ ++int ub_slab_charge(void *objp, int flags) ++{ ++ unsigned int size; ++ struct user_beancounter *ub; ++ ++ if (!should_charge(objp, flags)) ++ return 0; ++ ++ ub = get_beancounter(get_exec_ub()); ++ if (ub == NULL) ++ return 0; ++ ++ size = CHARGE_SIZE(kmem_obj_memusage(objp)); ++ if (charge_beancounter(ub, UB_KMEMSIZE, size, ++ (flags & __GFP_SOFT_UBC ? 
UB_SOFT : UB_HARD))) ++ goto out_err; ++ ++ if (inc_slab_charged(ub, objp, flags) < 0) { ++ uncharge_beancounter(ub, UB_KMEMSIZE, size); ++ goto out_err; ++ } ++ *slab_ub_ref(objp) = ub; ++ return 0; ++ ++out_err: ++ put_beancounter(ub); ++ return -ENOMEM; ++} ++ ++void ub_slab_uncharge(void *objp) ++{ ++ unsigned int size; ++ struct user_beancounter **ub_ref; ++ ++ if (!should_uncharge(objp)) ++ return; ++ ++ ub_ref = slab_ub_ref(objp); ++ if (*ub_ref == NULL) ++ return; ++ ++ dec_slab_charged(*ub_ref, objp); ++ size = CHARGE_SIZE(kmem_obj_memusage(objp)); ++ uncharge_beancounter(*ub_ref, UB_KMEMSIZE, size); ++ put_beancounter(*ub_ref); ++ *ub_ref = NULL; ++} ++ ++/* ++ * Pages accounting ++ */ ++ ++inline int ub_page_charge(struct page *page, int order, int mask) ++{ ++ struct user_beancounter *ub; ++ ++ ub = NULL; ++ if (!(mask & __GFP_UBC)) ++ goto out; ++ ++ ub = get_beancounter(get_exec_ub()); ++ if (ub == NULL) ++ goto out; ++ ++ if (charge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order), ++ (mask & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD))) ++ goto err; ++ if (inc_pages_charged(ub, page, order) < 0) { ++ uncharge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order)); ++ goto err; ++ } ++out: ++ BUG_ON(page_ub(page) != NULL); ++ page_ub(page) = ub; ++ return 0; ++ ++err: ++ BUG_ON(page_ub(page) != NULL); ++ put_beancounter(ub); ++ return -ENOMEM; ++} ++ ++inline void ub_page_uncharge(struct page *page, int order) ++{ ++ struct user_beancounter *ub; ++ ++ ub = page_ub(page); ++ if (ub == NULL) ++ return; ++ ++ dec_pages_charged(ub, page, order); ++ BUG_ON(ub->ub_magic != UB_MAGIC); ++ uncharge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order)); ++ put_beancounter(ub); ++ page_ub(page) = NULL; ++} ++ ++/* ++ * takes init_mm.page_table_lock ++ * some outer lock to protect pages from vmalloced area must be held ++ */ ++struct user_beancounter *vmalloc_ub(void *obj) ++{ ++ struct page *pg; ++ ++ spin_lock(&init_mm.page_table_lock); ++ pg = follow_page(NULL, (unsigned long)obj, FOLL_KERN); ++ spin_unlock(&init_mm.page_table_lock); ++ if (pg == NULL) ++ return NULL; ++ ++ return page_ub(pg); ++} ++ ++EXPORT_SYMBOL(vmalloc_ub); ++ ++struct user_beancounter *mem_ub(void *obj) ++{ ++ struct user_beancounter *ub; ++ ++ if ((unsigned long)obj >= VMALLOC_START && ++ (unsigned long)obj < VMALLOC_END) ++ ub = vmalloc_ub(obj); ++ else ++ ub = slab_ub(obj); ++ ++ return ub; ++} ++ ++EXPORT_SYMBOL(mem_ub); +diff -uprN linux-2.6.15.orig/kernel/ub/ub_misc.c linux-2.6.15-ve025stab014/kernel/ub/ub_misc.c +--- linux-2.6.15.orig/kernel/ub/ub_misc.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_misc.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,242 @@ ++/* ++ * kernel/ub/ub_misc.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/tty.h> ++#include <linux/tty_driver.h> ++#include <linux/signal.h> ++#include <linux/slab.h> ++#include <linux/fs.h> ++#include <linux/sched.h> ++#include <linux/kmem_cache.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> ++ ++/* ++ * Task staff ++ */ ++ ++static void init_task_sub(struct task_struct *tsk, ++ struct task_beancounter *old_bc) ++{ ++ struct task_beancounter *new_bc; ++ struct user_beancounter *sub; ++ ++ new_bc = &tsk->task_bc; ++ sub = old_bc->fork_sub; ++ new_bc->fork_sub = get_beancounter(sub); ++ new_bc->task_fnode = NULL; ++ new_bc->task_freserv = old_bc->task_freserv; ++ old_bc->task_freserv = NULL; ++ memset(&new_bc->task_data, 0, sizeof(new_bc->task_data)); ++} ++ ++int ub_task_charge(struct task_struct *parent, struct task_struct *task) ++{ ++ struct task_beancounter *old_bc; ++ struct task_beancounter *new_bc; ++ struct user_beancounter *ub; ++ ++ old_bc = &parent->task_bc; ++#if 0 ++ if (old_bc->exec_ub == NULL) { ++ /* FIXME: this won't work if task_bc is outside task_struct */ ++ init_task_sub(task, old_bc); ++ return 0; ++ } ++#endif ++ ub = old_bc->fork_sub; ++ ++ if (charge_beancounter(ub, UB_NUMPROC, 1, UB_HARD) < 0) ++ return -ENOMEM; ++ ++ new_bc = &task->task_bc; ++ new_bc->task_ub = get_beancounter(ub); ++ new_bc->exec_ub = get_beancounter(ub); ++ init_task_sub(task, old_bc); ++ return 0; ++} ++ ++void ub_task_uncharge(struct task_struct *task) ++{ ++ struct task_beancounter *task_bc; ++ ++ task_bc = &task->task_bc; ++ if (task_bc->task_ub != NULL) ++ uncharge_beancounter(task_bc->task_ub, UB_NUMPROC, 1); ++ ++ put_beancounter(task_bc->exec_ub); ++ put_beancounter(task_bc->task_ub); ++ put_beancounter(task_bc->fork_sub); ++ /* can't be freed elsewhere, failures possible in the middle of fork */ ++ if (task_bc->task_freserv != NULL) ++ kfree(task_bc->task_freserv); ++ ++ task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc; ++} ++ ++/* ++ * Files and file locks. ++ */ ++ ++int ub_file_charge(struct file *f) ++{ ++ struct user_beancounter *ub; ++ ++ /* No need to get_beancounter here since it's already got in slab */ ++ ub = slab_ub(f); ++ if (ub == NULL) ++ return 0; ++ ++ return charge_beancounter(ub, UB_NUMFILE, 1, UB_HARD); ++} ++ ++void ub_file_uncharge(struct file *f) ++{ ++ struct user_beancounter *ub; ++ ++ /* Ub will be put in slab */ ++ ub = slab_ub(f); ++ if (ub == NULL) ++ return; ++ ++ uncharge_beancounter(ub, UB_NUMFILE, 1); ++} ++ ++int ub_flock_charge(struct file_lock *fl, int hard) ++{ ++ struct user_beancounter *ub; ++ int err; ++ ++ /* No need to get_beancounter here since it's already got in slab */ ++ ub = slab_ub(fl); ++ if (ub == NULL) ++ return 0; ++ ++ err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? 
UB_HARD : UB_SOFT); ++ if (!err) ++ fl->fl_charged = 1; ++ return err; ++} ++ ++void ub_flock_uncharge(struct file_lock *fl) ++{ ++ struct user_beancounter *ub; ++ ++ /* Ub will be put in slab */ ++ ub = slab_ub(fl); ++ if (ub == NULL || !fl->fl_charged) ++ return; ++ ++ uncharge_beancounter(ub, UB_NUMFLOCK, 1); ++ fl->fl_charged = 0; ++} ++ ++/* ++ * Signal handling ++ */ ++ ++static int do_ub_siginfo_charge(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD)) ++ goto out_kmem; ++ ++ if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD)) ++ goto out_num; ++ ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return 0; ++ ++out_num: ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); ++out_kmem: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return -ENOMEM; ++} ++ ++static void do_ub_siginfo_uncharge(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); ++ __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub) ++{ ++ unsigned long size; ++ struct user_beancounter *p, *q; ++ ++ size = CHARGE_SIZE(kmem_obj_memusage(sq)); ++ for (p = ub; p != NULL; p = p->parent) { ++ if (do_ub_siginfo_charge(p, size)) ++ goto unroll; ++ } ++ ++ sq->sig_ub = get_beancounter(ub); ++ return 0; ++ ++unroll: ++ for (q = ub; q != p; q = q->parent) ++ do_ub_siginfo_uncharge(q, size); ++ return -ENOMEM; ++} ++ ++void ub_siginfo_uncharge(struct sigqueue *sq) ++{ ++ unsigned long size; ++ struct user_beancounter *ub, *p; ++ ++ p = ub = sq->sig_ub; ++ sq->sig_ub = NULL; ++ size = CHARGE_SIZE(kmem_obj_memusage(sq)); ++ for (; ub != NULL; ub = ub->parent) ++ do_ub_siginfo_uncharge(ub, size); ++ put_beancounter(p); ++} ++ ++/* ++ * PTYs ++ */ ++ ++int ub_pty_charge(struct tty_struct *tty) ++{ ++ struct user_beancounter *ub; ++ int retval; ++ ++ ub = slab_ub(tty); ++ retval = 0; ++ if (ub && tty->driver->subtype == PTY_TYPE_MASTER && ++ !test_bit(TTY_CHARGED, &tty->flags)) { ++ retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD); ++ if (!retval) ++ set_bit(TTY_CHARGED, &tty->flags); ++ } ++ return retval; ++} ++ ++void ub_pty_uncharge(struct tty_struct *tty) ++{ ++ struct user_beancounter *ub; ++ ++ ub = slab_ub(tty); ++ if (ub && tty->driver->subtype == PTY_TYPE_MASTER && ++ test_bit(TTY_CHARGED, &tty->flags)) { ++ uncharge_beancounter(ub, UB_NUMPTY, 1); ++ clear_bit(TTY_CHARGED, &tty->flags); ++ } ++} +diff -uprN linux-2.6.15.orig/kernel/ub/ub_net.c linux-2.6.15-ve025stab014/kernel/ub/ub_net.c +--- linux-2.6.15.orig/kernel/ub/ub_net.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_net.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,1041 @@ ++/* ++ * linux/kernel/ub/ub_net.c ++ * ++ * Copyright (C) 1998-2004 Andrey V. Savochkin <saw@saw.sw.com.sg> ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ * TODO: ++ * - sizeof(struct inode) charge ++ * = tcp_mem_schedule() feedback based on ub limits ++ * + measures so that one socket won't exhaust all send buffers, ++ * see bug in bugzilla ++ * = sk->socket check for NULL in snd_wakeups ++ * (tcp_write_space checks for NULL itself) ++ * + in tcp_close(), orphaned socket abortion should be based on ubc ++ * resources (same in tcp_out_of_resources) ++ * Beancounter should also have separate orphaned socket counter... ++ * + for rcv, in-order segment should be accepted ++ * if only barrier is exceeded ++ * = tcp_rmem_schedule() feedback based on ub limits ++ * - repair forward_alloc mechanism for receive buffers ++ * It's idea is that some buffer space is pre-charged so that receive fast ++ * path doesn't need to take spinlocks and do other heavy stuff ++ * + tcp_prune_queue actions based on ub limits ++ * + window adjustments depending on available buffers for receive ++ * - window adjustments depending on available buffers for send ++ * + race around usewreserv ++ * + avoid allocating new page for each tiny-gram, see letter from ANK ++ * + rename ub_sock_lock ++ * + sk->sleep wait queue probably can be used for all wakeups, and ++ * sk->ub_wait is unnecessary ++ * + for UNIX sockets, the current algorithm will lead to ++ * UB_UNIX_MINBUF-sized messages only for non-blocking case ++ * - charge for af_packet sockets ++ * + all datagram sockets should be charged to NUMUNIXSOCK ++ * - we do not charge for skb copies and clones staying in device queues ++ * + live-lock if number of sockets is big and buffer limits are small ++ * [diff-ubc-dbllim3] ++ * - check that multiple readers/writers on the same socket won't cause fatal ++ * consequences ++ * - check allocation/charge orders ++ * + There is potential problem with callback_lock. In *snd_wakeup we take ++ * beancounter first, in sock_def_error_report - callback_lock first. ++ * then beancounter. This is not a problem if callback_lock taken ++ * readonly, but anyway... ++ * - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator ++ * General kernel problems: ++ * - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC ++ * notification won't get signals ++ * - datagram_poll looks racy ++ * ++ */ ++ ++#include <linux/net.h> ++#include <linux/slab.h> ++#include <linux/kmem_cache.h> ++#include <linux/gfp.h> ++#include <linux/err.h> ++#include <linux/socket.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++ ++#include <net/sock.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_net.h> ++#include <ub/ub_debug.h> ++ ++ ++/* Skb truesize definition. Bad place. 
Den */ ++ ++static inline int skb_chargesize_head(struct sk_buff *skb) ++{ ++ return skb_charge_size(skb->end - skb->head + ++ sizeof(struct skb_shared_info)); ++} ++ ++int skb_charge_fullsize(struct sk_buff *skb) ++{ ++ int chargesize; ++ struct sk_buff *skbfrag; ++ ++ chargesize = skb_chargesize_head(skb) + ++ PAGE_SIZE * skb_shinfo(skb)->nr_frags; ++ if (likely(skb_shinfo(skb)->frag_list == NULL)) ++ return chargesize; ++ for (skbfrag = skb_shinfo(skb)->frag_list; ++ skbfrag != NULL; ++ skbfrag = skbfrag->next) { ++ chargesize += skb_charge_fullsize(skbfrag); ++ } ++ return chargesize; ++} ++EXPORT_SYMBOL(skb_charge_fullsize); ++ ++static int ub_sock_makewreserv_locked(struct sock *sk, ++ int bufid, int sockid, unsigned long size); ++ ++int __ub_too_many_orphans(struct sock *sk, int count) ++{ ++ struct user_beancounter *ub; ++ ++ if (sock_has_ubc(sk)) { ++ for (ub = sock_bc(sk)->ub; ub->parent != NULL; ub = ub->parent); ++ if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2) ++ return 1; ++ } ++ return 0; ++} ++ ++/* ++ * Queueing ++ */ ++ ++static void ub_sock_snd_wakeup(struct user_beancounter *ub) ++{ ++ struct list_head *p; ++ struct sock_beancounter *skbc; ++ struct sock *sk; ++ struct user_beancounter *cub; ++ unsigned long added; ++ ++ while (!list_empty(&ub->ub_other_sk_list)) { ++ p = ub->ub_other_sk_list.next; ++ skbc = list_entry(p, struct sock_beancounter, ub_sock_list); ++ sk = skbc_sock(skbc); ++ ub_debug(UBD_NET_SLEEP, "Found sock to wake up\n"); ++ added = -skbc->poll_reserv; ++ if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, ++ UB_NUMOTHERSOCK, skbc->ub_waitspc)) ++ break; ++ added += skbc->poll_reserv; ++ ++ /* ++ * See comments in ub_tcp_snd_wakeup. ++ * Locking note: both unix_write_space and ++ * sock_def_write_space take callback_lock themselves. ++ * We take it here just to be on the safe side and to ++ * act the same way as ub_tcp_snd_wakeup does. ++ */ ++ sk->sk_write_space(sk); ++ ++ list_del_init(&skbc->ub_sock_list); ++ ++ if (skbc->ub != ub && added) { ++ cub = get_beancounter(skbc->ub); ++ spin_unlock(&ub->ub_lock); ++ charge_beancounter_notop(cub, UB_OTHERSOCKBUF, added); ++ put_beancounter(cub); ++ spin_lock(&ub->ub_lock); ++ } ++ } ++} ++ ++static void ub_tcp_snd_wakeup(struct user_beancounter *ub) ++{ ++ struct list_head *p; ++ struct sock *sk; ++ struct sock_beancounter *skbc; ++ struct socket *sock; ++ struct user_beancounter *cub; ++ unsigned long added; ++ ++ while (!list_empty(&ub->ub_tcp_sk_list)) { ++ p = ub->ub_tcp_sk_list.next; ++ skbc = list_entry(p, struct sock_beancounter, ub_sock_list); ++ sk = skbc_sock(skbc); ++ ++ added = 0; ++ sock = sk->sk_socket; ++ if (sock == NULL) ++ /* sk being destroyed */ ++ goto cont; ++ ++ ub_debug(UBD_NET_SLEEP, ++ "Checking queue, waiting %lu, reserv %lu\n", ++ skbc->ub_waitspc, skbc->poll_reserv); ++ added = -skbc->poll_reserv; ++ if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, ++ UB_NUMTCPSOCK, skbc->ub_waitspc)) ++ break; ++ added += skbc->poll_reserv; ++ ++ /* ++ * Send async notifications and wake up. ++ * Locking note: we get callback_lock here because ++ * tcp_write_space is over-optimistic about calling context ++ * (socket lock is presumed). So we get the lock here although ++ * it belongs to the callback. 
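skb_charge_fullsize() above accounts a packet at its true memory cost: the linear head (as rounded by skb_charge_size(), including the shared-info area), one full page per paged fragment, and a recursive walk over the frag_list chain. A simplified model of that walk, using an illustrative fake_skb type rather than the kernel's sk_buff:

#include <stddef.h>

#define PAGE_SIZE	4096UL		/* illustrative; the kernel value is per-arch */

/* only the fields the size walk actually needs */
struct fake_skb {
	unsigned long	 head_size;	/* linear data plus shared info   */
	unsigned int	 nr_frags;	/* number of paged fragments      */
	struct fake_skb	*frag_list;	/* head of the fragment skb chain */
	struct fake_skb	*next;		/* next skb on that chain         */
};

static unsigned long fake_charge_fullsize(const struct fake_skb *skb)
{
	unsigned long size;
	const struct fake_skb *frag;

	size = skb->head_size + PAGE_SIZE * skb->nr_frags;
	for (frag = skb->frag_list; frag != NULL; frag = frag->next)
		size += fake_charge_fullsize(frag);	/* fragments may nest */
	return size;
}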
++ */ ++ sk->sk_write_space(sk); ++ ++cont: ++ list_del_init(&skbc->ub_sock_list); ++ ++ if (skbc->ub != ub && added) { ++ cub = get_beancounter(skbc->ub); ++ spin_unlock(&ub->ub_lock); ++ charge_beancounter_notop(cub, UB_TCPSNDBUF, added); ++ put_beancounter(cub); ++ spin_lock(&ub->ub_lock); ++ } ++ } ++} ++ ++void ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size) ++{ ++ unsigned long flags; ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long added_reserv; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ skbc = sock_bc(sk); ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size); ++ added_reserv = -skbc->poll_reserv; ++ if (!ub_sock_makewreserv_locked(sk, res, bid2sid(res), size)) { ++ /* ++ * It looks a bit hackish, but it is compatible with both ++ * wait_for_xx_ubspace and poll. ++ * This __set_current_state is equivalent to a wakeup event ++ * right after spin_unlock_irqrestore. ++ */ ++ __set_current_state(TASK_RUNNING); ++ added_reserv += skbc->poll_reserv; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ if (added_reserv) ++ charge_beancounter_notop(skbc->ub, res, added_reserv); ++ return; ++ } ++ ++ ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n"); ++ skbc->ub_waitspc = size; ++ if (!list_empty(&skbc->ub_sock_list)) { ++ ub_debug(UBD_NET_SOCKET, ++ "re-adding socket to beancounter %p.\n", ub); ++ goto out; ++ } ++ ++ switch (res) { ++ case UB_TCPSNDBUF: ++ list_add_tail(&skbc->ub_sock_list, ++ &ub->ub_tcp_sk_list); ++ break; ++ case UB_OTHERSOCKBUF: ++ list_add_tail(&skbc->ub_sock_list, ++ &ub->ub_other_sk_list); ++ break; ++ default: ++ BUG(); ++ } ++out: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++ ++/* ++ * Helpers ++ */ ++ ++void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, ++ unsigned long size, int resource) ++{ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ if (sock_bc(sk)->ub == NULL) ++ BUG(); ++ skb_bc(skb)->ub = sock_bc(sk)->ub; ++ skb_bc(skb)->charged = size; ++ skb_bc(skb)->resource = resource; ++ ++ /* Ugly. Ugly. 
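ub_sock_snd_queue_add() above retries the write-space reservation under the beancounter lock and only parks the socket on the per-beancounter wait list (TCP and non-TCP sockets keep separate lists) when the retry still fails; the wanted size is remembered so the wakeup side knows how much to re-reserve. A rough, single-threaded sketch of that reserve-or-queue decision with stand-in types, no locking and no real list handling:

#include <stddef.h>

/* simplified beancounter: one shared send-buffer pool */
struct bc_pool {
	unsigned long held, limit;
};

/* simplified per-socket send accounting */
struct sock_acct {
	unsigned long reserv;	/* bytes already reserved for this socket */
	unsigned long want;	/* bytes the writer is currently short of */
	int queued;		/* 1 once parked on the beancounter list  */
};

/* grow the socket's reservation to 'size' bytes if the pool allows it */
static int try_make_reserv(struct bc_pool *bc, struct sock_acct *sa,
		unsigned long size)
{
	unsigned long need;

	if (sa->reserv >= size)
		return 0;			/* already enough reserved */
	need = size - sa->reserv;
	if (bc->held + need > bc->limit)
		return -1;			/* pool exhausted */
	bc->held += need;
	sa->reserv = size;
	return 0;
}

/* returns 1 if the writer may proceed, 0 if it must wait for a wakeup */
static int reserve_or_queue(struct bc_pool *bc, struct sock_acct *sa,
		unsigned long size)
{
	if (try_make_reserv(bc, sa, size) == 0)
		return 1;
	sa->want = size;	/* the wakeup path retries with this size */
	sa->queued = 1;		/* stands in for list_add_tail() above    */
	return 0;
}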
Skb in sk writequeue can live without ref to sk */ ++ if (skb->sk == NULL) ++ skb->sk = sk; ++} ++ ++static inline void ub_skb_set_uncharge(struct sk_buff *skb) ++{ ++ skb_bc(skb)->ub = NULL; ++ skb_bc(skb)->charged = 0; ++ skb_bc(skb)->resource = 0; ++} ++ ++static inline void __uncharge_sockbuf(struct sock_beancounter *skbc, ++ struct user_beancounter *ub, int resource, unsigned long size) ++{ ++ if (ub != NULL) ++ __uncharge_beancounter_locked(ub, resource, size); ++ ++ if (skbc != NULL) { ++ if (skbc->ub_wcharged > size) ++ skbc->ub_wcharged -= size; ++ else ++ skbc->ub_wcharged = 0; ++ } ++} ++ ++static void ub_update_rmem_thres(struct sock_beancounter *skub) ++{ ++ struct user_beancounter *ub; ++ ++ if (skub && skub->ub) { ++ for (ub = skub->ub; ub->parent != NULL; ub = ub->parent); ++ ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier / ++ (ub->ub_parms[UB_NUMTCPSOCK].held + 1); ++ } ++} ++inline int ub_skb_alloc_bc(struct sk_buff *skb, int gfp_mask) ++{ ++ memset(skb_bc(skb), 0, sizeof(struct skb_beancounter)); ++ return 0; ++} ++ ++inline void ub_skb_free_bc(struct sk_buff *skb) ++{ ++} ++ ++ ++/* ++ * Charge socket number ++ */ ++ ++static inline int sk_alloc_beancounter(struct sock *sk) ++{ ++ struct sock_beancounter *skbc; ++ ++ skbc = sock_bc(sk); ++ memset(skbc, 0, sizeof(struct sock_beancounter)); ++ return 0; ++} ++ ++static inline void sk_free_beancounter(struct sock *sk) ++{ ++} ++ ++static int __sock_charge(struct sock *sk, int res) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ ++ ub = get_exec_ub(); ++ if (ub == NULL) ++ return 0; ++ if (sk_alloc_beancounter(sk) < 0) ++ return -ENOMEM; ++ ++ skbc = sock_bc(sk); ++ INIT_LIST_HEAD(&skbc->ub_sock_list); ++ ++ if (charge_beancounter(ub, res, 1, UB_HARD) < 0) ++ goto out_limit; ++ ++ /* TCP listen sock or process keeps referrence to UB */ ++ skbc->ub = get_beancounter(ub); ++ return 0; ++ ++out_limit: ++ sk_free_beancounter(sk); ++ return -ENOMEM; ++} ++ ++int ub_tcp_sock_charge(struct sock *sk) ++{ ++ int ret; ++ ++ ret = __sock_charge(sk, UB_NUMTCPSOCK); ++ ub_update_rmem_thres(sock_bc(sk)); ++ ++ return ret; ++} ++ ++int ub_other_sock_charge(struct sock *sk) ++{ ++ return __sock_charge(sk, UB_NUMOTHERSOCK); ++} ++ ++EXPORT_SYMBOL(ub_other_sock_charge); ++ ++int ub_sock_charge(struct sock *sk, int family, int type) ++{ ++ return (IS_TCP_SOCK(family, type) ? ++ ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk)); ++} ++ ++/* ++ * Uncharge socket number ++ */ ++ ++void ub_sock_uncharge(struct sock *sk) ++{ ++ int is_tcp_sock; ++ unsigned long flags; ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long reserv; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type); ++ skbc = sock_bc(sk); ++ ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk); ++ ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (!list_empty(&skbc->ub_sock_list)) { ++ ub_debug(UBD_NET_SOCKET, ++ "ub_sock_uncharge: removing from ub(%p) queue.\n", ++ skbc); ++ list_del_init(&skbc->ub_sock_list); ++ } ++ ++ reserv = skbc->poll_reserv; ++ __uncharge_beancounter_locked(ub, ++ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), ++ reserv); ++ __uncharge_beancounter_locked(ub, ++ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); ++ ++ /* The check sk->sk_family != PF_NETLINK is made as the skb is ++ * queued to the kernel end of socket while changed to the user one. 
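Each new socket above is charged against UB_NUMTCPSOCK or UB_NUMOTHERSOCK and then pins a reference to its beancounter, so later buffer charges keep hitting the same accounting object no matter which task touches the socket; for TCP, ub_update_rmem_thres() additionally recomputes a per-socket receive threshold as the TCPRCVBUF barrier divided by the number of TCP sockets plus one. A compressed sketch of that create path with illustrative stand-in types:

#include <stddef.h>

struct bc {
	long refcount;
	long socks_held, socks_limit;	/* UB_NUMTCPSOCK analogue       */
	long rcvbuf_barrier;		/* UB_TCPRCVBUF barrier         */
	long rmem_thres;		/* per-socket receive threshold */
};

struct sock_acct {
	struct bc *ub;			/* owner pinned at creation time */
};

static struct bc *bc_get(struct bc *b) { b->refcount++; return b; }

/* charge one socket; on success pin the beancounter on the socket */
static int sock_charge(struct bc *b, struct sock_acct *sa)
{
	if (b->socks_held + 1 > b->socks_limit)
		return -1;		/* -ENOMEM in the kernel code */
	b->socks_held++;
	sa->ub = bc_get(b);
	/* e.g. a 4 MiB barrier and 15 sockets held give a 256 KiB threshold */
	b->rmem_thres = b->rcvbuf_barrier / (b->socks_held + 1);
	return 0;
}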
++ * Den */ ++ if (skbc->ub_wcharged > reserv && ++ sk->sk_family != PF_NETLINK) { ++ skbc->ub_wcharged -= reserv; ++ printk(KERN_WARNING ++ "ub_sock_uncharge: wch=%lu for ub %p (%d).\n", ++ skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid); ++ } else ++ skbc->ub_wcharged = 0; ++ skbc->poll_reserv = 0; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ uncharge_beancounter_notop(skbc->ub, ++ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), ++ reserv); ++ uncharge_beancounter_notop(skbc->ub, ++ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); ++ ++ put_beancounter(skbc->ub); ++ sk_free_beancounter(sk); ++} ++ ++/* ++ * Send - receive buffers ++ */ ++ ++/* Special case for netlink_dump - (un)charges precalculated size */ ++int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk) ++{ ++ int ret; ++ unsigned long chargesize; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ chargesize = skb_charge_fullsize(skb); ++ ret = charge_beancounter(sock_bc(sk)->ub, ++ UB_DGRAMRCVBUF, chargesize, UB_HARD); ++ if (ret < 0) ++ return ret; ++ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); ++ return ret; ++} ++ ++/* ++ * Poll reserv accounting ++ */ ++static int ub_sock_makewreserv_locked(struct sock *sk, ++ int bufid, int sockid, unsigned long size) ++{ ++ unsigned long wcharge_added; ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ ++ if (!sock_has_ubc(sk)) ++ goto out; ++ ++ skbc = sock_bc(sk); ++ if (skbc->poll_reserv >= size) /* no work to be done */ ++ goto out; ++ ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ ub->ub_parms[bufid].held += size - skbc->poll_reserv; ++ ++ wcharge_added = 0; ++ /* ++ * Logic: ++ * 1) when used memory hits barrier, we set wmem_pressure; ++ * wmem_pressure is reset under barrier/2; ++ * between barrier/2 and barrier we limit per-socket buffer growth; ++ * 2) each socket is guaranteed to get (limit-barrier)/maxsockets ++ * calculated on the base of memory eaten after the barrier is hit ++ */ ++ skbc = sock_bc(sk); ++ if (!ub_hfbarrier_hit(ub, bufid)) { ++ if (ub->ub_wmem_pressure) ++ ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 " ++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", ++ sk, size, skbc->poll_reserv, ++ ub->ub_parms[bufid].held, ++ skbc->ub_wcharged, sk->sk_sndbuf); ++ ub->ub_wmem_pressure = 0; ++ } ++ if (ub_barrier_hit(ub, bufid)) { ++ if (!ub->ub_wmem_pressure) ++ ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 " ++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", ++ sk, size, skbc->poll_reserv, ++ ub->ub_parms[bufid].held, ++ skbc->ub_wcharged, sk->sk_sndbuf); ++ ub->ub_wmem_pressure = 1; ++ wcharge_added = size - skbc->poll_reserv; ++ skbc->ub_wcharged += wcharge_added; ++ if (skbc->ub_wcharged * ub->ub_parms[sockid].limit + ++ ub->ub_parms[bufid].barrier > ++ ub->ub_parms[bufid].limit) ++ goto unroll; ++ } ++ if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit) ++ goto unroll; ++ ++ ub_adjust_maxheld(ub, bufid); ++ skbc->poll_reserv = size; ++out: ++ return 0; ++ ++unroll: ++ ub_debug(UBD_NET_SEND, ++ "makewres: deny " ++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", ++ sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held, ++ skbc->ub_wcharged, sk->sk_sndbuf); ++ skbc->ub_wcharged -= wcharge_added; ++ ub->ub_parms[bufid].failcnt++; ++ ub->ub_parms[bufid].held -= size - skbc->poll_reserv; ++ return -ENOMEM; ++} ++ ++int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long flags; ++ unsigned long 
added_reserv; ++ int err; ++ ++ skbc = sock_bc(sk); ++ ++ /* ++ * This function provides that there is sufficient reserve upon return ++ * only if sk has only one user. We can check poll_reserv without ++ * serialization and avoid locking if the reserve already exists. ++ */ ++ if (!sock_has_ubc(sk) || skbc->poll_reserv >= size) ++ return 0; ++ ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ added_reserv = -skbc->poll_reserv; ++ err = ub_sock_makewreserv_locked(sk, bufid, bid2sid(bufid), size); ++ added_reserv += skbc->poll_reserv; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ if (added_reserv) ++ charge_beancounter_notop(skbc->ub, bufid, added_reserv); ++ ++ return err; ++} ++ ++int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long flags; ++ unsigned long added_reserv; ++ int err; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ skbc = sock_bc(sk); ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ added_reserv = -skbc->poll_reserv; ++ err = ub_sock_makewreserv_locked(sk, bufid, bid2sid(bufid), size); ++ added_reserv += skbc->poll_reserv; ++ if (!err) ++ skbc->poll_reserv -= size; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ if (added_reserv) ++ charge_beancounter_notop(skbc->ub, bufid, added_reserv); ++ ++ return err; ++} ++ ++void ub_sock_ret_wreserv(struct sock *sk, int bufid, ++ unsigned long size, unsigned long ressize) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long extra; ++ unsigned long flags; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ extra = 0; ++ skbc = sock_bc(sk); ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ skbc->poll_reserv += size; ++ if (skbc->poll_reserv > ressize) { ++ extra = skbc->poll_reserv - ressize; ++ __uncharge_beancounter_locked(ub, bufid, extra); ++ ++ if (skbc->ub_wcharged > skbc->poll_reserv - ressize) ++ skbc->ub_wcharged -= skbc->poll_reserv - ressize; ++ else ++ skbc->ub_wcharged = 0; ++ skbc->poll_reserv = ressize; ++ } ++ ++ ub_tcp_snd_wakeup(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ if (extra) ++ uncharge_beancounter_notop(skbc->ub, bufid, extra); ++} ++ ++long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ ++ add_wait_queue(sk->sk_sleep, &wait); ++ for (;;) { ++ if (signal_pending(current)) ++ break; ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size)) ++ break; ++ ++ if (sk->sk_shutdown & SEND_SHUTDOWN) ++ break; ++ if (sk->sk_err) ++ break; ++ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size); ++ timeo = schedule_timeout(timeo); ++ } ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(sk->sk_sleep, &wait); ++ return timeo; ++} ++ ++int ub_sock_makewres_other(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size); ++} ++ ++int ub_sock_makewres_tcp(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size); ++} ++ ++int ub_sock_getwres_other(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_get_wreserv(sk, UB_OTHERSOCKBUF, size); ++} ++ ++int ub_sock_getwres_tcp(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size); ++} ++ ++void 
ub_sock_retwres_other(struct sock *sk, unsigned long size, ++ unsigned long ressize) ++{ ++ ub_sock_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize); ++} ++ ++void ub_sock_retwres_tcp(struct sock *sk, unsigned long size, ++ unsigned long ressize) ++{ ++ ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize); ++} ++ ++void ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz) ++{ ++ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz); ++} ++ ++void ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz) ++{ ++ ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz); ++} ++ ++void ub_sock_sndqueuedel(struct sock *sk) ++{ ++ struct sock_beancounter *skbc; ++ unsigned long flags; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ skbc = sock_bc(sk); ++ ++ /* race with write_space callback of other socket */ ++ spin_lock_irqsave(&skbc->ub->ub_lock, flags); ++ list_del_init(&skbc->ub_sock_list); ++ spin_unlock_irqrestore(&skbc->ub->ub_lock, flags); ++} ++ ++/* ++ * UB_DGRAMRCVBUF ++ */ ++ ++int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb) ++{ ++ unsigned long chargesize; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ chargesize = skb_charge_fullsize(skb); ++ if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF, ++ chargesize, UB_HARD)) ++ return -ENOMEM; ++ ++ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); ++ return 0; ++} ++ ++EXPORT_SYMBOL(ub_sockrcvbuf_charge); ++ ++static void ub_sockrcvbuf_uncharge(struct sk_buff *skb) ++{ ++ uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF, ++ skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++/* ++ * UB_TCPRCVBUF ++ */ ++static int charge_tcprcvbuf(struct sock *sk, struct sk_buff *skb, ++ enum severity strict) ++{ ++ int retval; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ unsigned long chargesize; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ /* ++ * Memory pressure reactions: ++ * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND) ++ * 2) set UB_RMEM_SHRINK and tcp_clamp_window() ++ * tcp_collapse_queues() if rmem_alloc > rcvbuf ++ * 3) drop OFO, tcp_purge_ofo() ++ * 4) drop all. ++ * Currently, we do #2 and #3 at once (which means that current ++ * collapsing of OFO queue in tcp_collapse_queues() is a waste of time, ++ * for example...) ++ * On memory pressure we jump from #0 to #3, and when the pressure ++ * subsides, to #1. ++ */ ++ retval = 0; ++ chargesize = skb_charge_fullsize(skb); ++ ++ for (ub = sock_bc(sk)->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_parms[UB_TCPRCVBUF].held += chargesize; ++ if (ub->ub_parms[UB_TCPRCVBUF].held > ++ ub->ub_parms[UB_TCPRCVBUF].barrier && ++ strict != UB_FORCE) ++ goto excess; ++ ub_adjust_maxheld(ub, UB_TCPRCVBUF); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++out: ++ if (retval == 0) { ++ charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF, ++ chargesize); ++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); ++ } ++ return retval; ++ ++excess: ++ ub->ub_rmem_pressure = UB_RMEM_SHRINK; ++ if (strict == UB_HARD) ++ retval = -ENOMEM; ++ if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit) ++ retval = -ENOMEM; ++ /* ++ * We try to leave numsock*maxadvmss as a reserve for sockets not ++ * queueing any data yet (if the difference between the barrier and the ++ * limit is enough for this reserve). 
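The reservation logic above throttles per-socket growth once the beancounter sits between its TCPSNDBUF barrier and limit: a request is denied when the socket's charged total times the socket limit, plus the barrier, would exceed the buffer limit, which guarantees each socket roughly (limit - barrier) / socket-limit of space past the barrier. A small self-contained check of that deny condition with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long barrier  = 1UL << 20;	/* 1 MiB TCPSNDBUF barrier    */
	unsigned long limit    = 2UL << 20;	/* 2 MiB TCPSNDBUF limit      */
	unsigned long nsock    = 128;		/* NUMTCPSOCK limit           */
	unsigned long wcharged = 16UL << 10;	/* this socket charged 16 KiB */

	/* deny condition from ub_sock_makewreserv_locked():
	 * 16 KiB * 128 + 1 MiB = 3 MiB > 2 MiB, so this request is refused;
	 * the per-socket guarantee past the barrier works out to
	 * (limit - barrier) / nsock = 8 KiB. */
	printf("denied: %d\n", wcharged * nsock + barrier > limit);
	return 0;
}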
++ */ ++ if (ub->ub_parms[UB_TCPRCVBUF].held + ++ ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss ++ > ub->ub_parms[UB_TCPRCVBUF].limit && ++ atomic_read(&sk->sk_rmem_alloc)) ++ retval = -ENOMEM; ++ if (retval) { ++ ub->ub_parms[UB_TCPRCVBUF].held -= chargesize; ++ ub->ub_parms[UB_TCPRCVBUF].failcnt++; ++ } ++ ub_adjust_maxheld(ub, UB_TCPRCVBUF); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ goto out; ++} ++ ++int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb) ++{ ++ return charge_tcprcvbuf(sk, skb, UB_HARD); ++} ++ ++int ub_tcprcvbuf_charge_forced(struct sock *sk, struct sk_buff *skb) ++{ ++ return charge_tcprcvbuf(sk, skb, UB_FORCE); ++} ++ ++static void ub_tcprcvbuf_uncharge(struct sk_buff *skb) ++{ ++ unsigned long flags; ++ unsigned long held, bar; ++ int prev_pres; ++ struct user_beancounter *ub; ++ ++ for (ub = skb_bc(skb)->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) { ++ printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n", ++ skb_bc(skb)->charged, ++ ub, ub->ub_parms[UB_TCPRCVBUF].held); ++ /* ass-saving bung */ ++ skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held; ++ } ++ ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged; ++ held = ub->ub_parms[UB_TCPRCVBUF].held; ++ bar = ub->ub_parms[UB_TCPRCVBUF].barrier; ++ prev_pres = ub->ub_rmem_pressure; ++ if (held <= bar - (bar >> 2)) ++ ub->ub_rmem_pressure = UB_RMEM_EXPAND; ++ else if (held <= bar) ++ ub->ub_rmem_pressure = UB_RMEM_KEEP; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF, ++ skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++ ++/* ++ * UB_OTHERSOCKBUF ++ */ ++ ++static void ub_socksndbuf_uncharge(struct sk_buff *skb) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub, *cub; ++ struct sock_beancounter *sk_bc; ++ ++ /* resource was set. 
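On the charge side above, exceeding the TCPRCVBUF barrier flags UB_RMEM_SHRINK; the uncharge side relaxes that with a quarter-barrier hysteresis, returning to UB_RMEM_EXPAND only once usage drops to three quarters of the barrier and holding UB_RMEM_KEEP in between. The classification folded into one illustrative helper (the enum names are stand-ins for the UB_RMEM_* values):

enum rmem_state { RMEM_EXPAND, RMEM_KEEP, RMEM_SHRINK };

static enum rmem_state rmem_pressure(unsigned long held, unsigned long bar)
{
	if (held <= bar - (bar >> 2))	/* at or below 3/4 of the barrier */
		return RMEM_EXPAND;
	if (held <= bar)		/* between 3/4 and the barrier    */
		return RMEM_KEEP;
	return RMEM_SHRINK;		/* above the barrier              */
}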
no check for ub required */ ++ cub = skb_bc(skb)->ub; ++ for (ub = cub; ub->parent != NULL; ub = ub->parent); ++ skb_bc(skb)->ub = NULL; ++ if (skb->sk != NULL) ++ sk_bc = sock_bc(skb->sk); ++ else ++ sk_bc = NULL; ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_sockbuf(sk_bc, ub, UB_OTHERSOCKBUF, ++ skb_bc(skb)->charged); ++ ub_sock_snd_wakeup(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++static void ub_tcpsndbuf_uncharge(struct sk_buff *skb) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub, *cub; ++ ++ /* resource can be not set, called manually */ ++ cub = skb_bc(skb)->ub; ++ if (cub == NULL) ++ return; ++ for (ub = cub; ub->parent != NULL; ub = ub->parent); ++ skb_bc(skb)->ub = NULL; ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_sockbuf(sock_bc(skb->sk), ub, UB_TCPSNDBUF, ++ skb_bc(skb)->charged); ++ ub_tcp_snd_wakeup(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ uncharge_beancounter_notop(cub, UB_TCPSNDBUF, skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++void ub_skb_uncharge(struct sk_buff *skb) ++{ ++ switch (skb_bc(skb)->resource) { ++ case UB_TCPSNDBUF: ++ ub_tcpsndbuf_uncharge(skb); ++ break; ++ case UB_TCPRCVBUF: ++ ub_tcprcvbuf_uncharge(skb); ++ break; ++ case UB_DGRAMRCVBUF: ++ ub_sockrcvbuf_uncharge(skb); ++ break; ++ case UB_OTHERSOCKBUF: ++ ub_socksndbuf_uncharge(skb); ++ break; ++ } ++} ++ ++EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */ ++ ++/* ++ * TCP send buffers accouting. Paged part ++ */ ++int ub_sock_tcp_chargepage(struct sock *sk) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long added; ++ unsigned long flags; ++ int err; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ skbc = sock_bc(sk); ++ ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ /* Try to charge full page */ ++ err = ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, UB_NUMTCPSOCK, ++ PAGE_SIZE); ++ if (err == 0) { ++ skbc->poll_reserv -= PAGE_SIZE; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, PAGE_SIZE); ++ return 0; ++ } ++ ++ /* Try to charge page enough to satisfy sys_select. The possible ++ overdraft for the rest of the page is generally better then ++ requesting full page in tcp_poll. This should not happen ++ frequently. Den */ ++ added = -skbc->poll_reserv; ++ err = ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, UB_NUMTCPSOCK, ++ SOCK_MIN_UBCSPACE); ++ if (err < 0) { ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return err; ++ } ++ __charge_beancounter_locked(ub, UB_TCPSNDBUF, ++ PAGE_SIZE - skbc->poll_reserv, ++ UB_FORCE); ++ added += PAGE_SIZE; ++ skbc->poll_reserv = 0; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added); ++ ++ return 0; ++ ++} ++ ++void ub_sock_tcp_detachpage(struct sock *sk) ++{ ++ struct sk_buff *skb; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ /* The page is just detached from socket. 
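ub_skb_uncharge() above can release a buffer long after the charging context is gone because every skb carries its own record of who was charged, for how much, and against which resource; the switch simply dispatches on that tag. A condensed model of the tag-and-dispatch idea with illustrative types:

#include <stddef.h>

enum skb_res { RES_TCPSND, RES_TCPRCV, RES_DGRAMRCV, RES_OTHERSND, RES_MAX };

struct bc {
	unsigned long held[RES_MAX];
};

/* the moral equivalent of struct skb_beancounter */
struct skb_tag {
	struct bc	*ub;		/* who was charged         */
	unsigned long	 charged;	/* how many bytes          */
	enum skb_res	 resource;	/* against which resource  */
};

static void skb_uncharge(struct skb_tag *tag)
{
	if (tag->ub == NULL)
		return;			/* never charged, nothing to do */
	tag->ub->held[tag->resource] -= tag->charged;
	tag->ub = NULL;			/* mark the skb as uncharged    */
	tag->charged = 0;
}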
The last skb in queue ++ with paged part holds referrence to it */ ++ skb = skb_peek_tail(&sk->sk_write_queue); ++ if (skb == NULL) { ++ /* If the queue is empty - all data is sent and page is about ++ to be freed */ ++ uncharge_beancounter(sock_bc(sk)->ub, UB_TCPSNDBUF, PAGE_SIZE); ++ return; ++ } ++ /* Last skb is a good aproximation for a last skb with paged part */ ++ skb_bc(skb)->charged += PAGE_SIZE; ++} ++ ++static int charge_tcpsndbuf(struct sock *sk, struct sk_buff *skb, ++ enum severity strict) ++{ ++ int ret; ++ unsigned long chargesize; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ chargesize = skb_charge_fullsize(skb); ++ ret = charge_beancounter(sock_bc(sk)->ub, UB_TCPSNDBUF, chargesize, ++ strict); ++ if (ret < 0) ++ return ret; ++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); ++ sock_bc(sk)->ub_wcharged += chargesize; ++ return ret; ++} ++ ++int ub_tcpsndbuf_charge(struct sock *sk, struct sk_buff *skb) ++{ ++ return charge_tcpsndbuf(sk, skb, UB_HARD); ++} ++ ++int ub_tcpsndbuf_charge_forced(struct sock *sk, struct sk_buff *skb) ++{ ++ return charge_tcpsndbuf(sk, skb, UB_FORCE); ++} ++ ++/* ++ * Initialization staff ++ */ ++int __init skbc_cache_init(void) ++{ ++ return 0; ++} +diff -uprN linux-2.6.15.orig/kernel/ub/ub_oom.c linux-2.6.15-ve025stab014/kernel/ub/ub_oom.c +--- linux-2.6.15.orig/kernel/ub/ub_oom.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_oom.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,157 @@ ++/* ++ * kernel/ub/ub_oom.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/sched.h> ++#include <linux/spinlock.h> ++#include <linux/mm.h> ++#include <linux/swap.h> ++ ++#include <asm/page.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_oom.h> ++#include <ub/ub_misc.h> ++#include <ub/ub_hash.h> ++ ++static DEFINE_SPINLOCK(oom_generation_lock); ++static int oom_generation = 0; ++static int oom_kill_counter = 0; ++ ++void ub_oom_kill_task(struct task_struct *tsk, struct mm_struct * mm) ++{ ++ struct user_beancounter *ub; ++ static struct ub_rate_info ri = { 5, 60*HZ }; ++ char ub_uid[64]; ++ ++ ub = mm->mm_ub; ++ if (ub) ++ print_ub_uid(ub, ub_uid, sizeof(ub_uid)); ++ else { ++ ub_uid[0] = '-'; ++ ub_uid[1] = '1'; ++ ub_uid[2] = '\0'; ++ } ++ ++ printk(KERN_INFO"MM to kill %p (UB=%s, VM=%lu, free=%u).\n", ++ mm, ub_uid, mm->total_vm, nr_free_pages()); ++ ++ if (ub_ratelimit(&ri)) ++ show_mem(); ++ ++ WARN_ON(!spin_is_locked(&oom_generation_lock)); ++ WARN_ON(!test_ti_thread_flag(tsk->thread_info, TIF_MEMDIE)); ++ mm->mm_ub->ub_parms[UB_OOMGUARPAGES].failcnt++; ++ oom_kill_counter++; ++} ++ ++void ub_oom_init(void) ++{ ++ current->task_bc.oom_generation = oom_generation; ++} ++ ++int ub_oom_start(void) ++{ ++ int i; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ i = current->task_bc.oom_generation; ++ spin_lock(&oom_generation_lock); ++ /* Someone already helped us */ ++ if (i != oom_generation) { ++ spin_unlock(&oom_generation_lock); ++ return -EALREADY; ++ } ++ /* OOM in progress */ ++ if (oom_kill_counter) { ++ spin_unlock(&oom_generation_lock); ++ schedule_timeout_uninterruptible(5 * HZ); ++ spin_lock(&oom_generation_lock); ++ if (i != oom_generation) { ++ spin_unlock(&oom_generation_lock); ++ return -EALREADY; ++ } ++ } ++ /* ++ * Some process is stuck exiting. ++ * No choice other than to kill something else. 
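The OOM entry path above serializes killers with a global generation counter: a task snapshots the generation before reclaim, and if the counter has moved on by the time it takes the lock, somebody else's kill already freed memory and the caller backs off with -EALREADY; a kill still in flight makes it sleep and re-check. A rough user-space sketch of the generation check only, with a pthread mutex standing in for oom_generation_lock and the sleep-and-retry step left out:

#include <pthread.h>

static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;
static int oom_generation;		/* bumped when a killed task exits */
static int oom_kill_in_flight;		/* kills started but not finished  */

/* returns 0 with oom_lock held if the caller should pick a victim,
 * -1 if another killer already made progress on our behalf */
static int oom_start(int seen_generation)
{
	pthread_mutex_lock(&oom_lock);
	if (seen_generation != oom_generation || oom_kill_in_flight) {
		pthread_mutex_unlock(&oom_lock);
		return -1;
	}
	return 0;	/* the caller unlocks later, as ub_oom_stop() does */
}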
++ * Return with oom_generation_lock held hoping ++ * not to forget to drop it later ++ */ ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ for_each_beancounter(i, ub) ++ ub->ub_oom_noproc = 0; ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return 0; ++} ++ ++static inline long ub_current_overdraft(struct user_beancounter *ub) ++{ ++ return ub->ub_parms[UB_OOMGUARPAGES].held + ++ ((ub->ub_parms[UB_KMEMSIZE].held ++ + ub->ub_parms[UB_TCPSNDBUF].held ++ + ub->ub_parms[UB_TCPRCVBUF].held ++ + ub->ub_parms[UB_OTHERSOCKBUF].held ++ + ub->ub_parms[UB_DGRAMRCVBUF].held) ++ >> PAGE_SHIFT) - ub->ub_parms[UB_OOMGUARPAGES].barrier; ++} ++ ++/* ++ * Select an user_beancounter to find task inside it to be killed. ++ * Select the beancounter with the biggest excess of resource usage ++ * to kill a process belonging to that beancounter later, or returns ++ * NULL if there are no beancounters with such excess. ++ */ ++ ++struct user_beancounter *ub_oom_select_worst(void) ++{ ++ struct user_beancounter *ub, *walkp; ++ unsigned long flags, ub_maxover; ++ int i; ++ ++ ub_maxover = 0; ++ ub = NULL; ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ ++ for_each_beancounter(i, walkp) { ++ if (walkp->parent != NULL) ++ continue; ++ if (walkp->ub_oom_noproc) ++ continue; ++ if (ub_current_overdraft(walkp) <= ub_maxover) ++ continue; ++ ++ ub = walkp; ++ } ++ if(ub) { ++ get_beancounter(ub); ++ ub->ub_oom_noproc = 1; ++ } ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return ub; ++} ++ ++void ub_oom_stop(void) ++{ ++ spin_unlock(&oom_generation_lock); ++} ++ ++void ub_oom_task_exit(struct task_struct *tsk) ++{ ++ /* In order to allow OOM to happen from now on */ ++ spin_lock(&oom_generation_lock); ++ if (test_ti_thread_flag(tsk->thread_info, TIF_MEMDIE)) { ++ if (--oom_kill_counter == 0) ++ oom_generation++; ++ printk("OOM killed process %d exited, free=%u.\n", ++ tsk->pid, nr_free_pages()); ++ } ++ spin_unlock(&oom_generation_lock); ++} +diff -uprN linux-2.6.15.orig/kernel/ub/ub_page_bc.c linux-2.6.15-ve025stab014/kernel/ub/ub_page_bc.c +--- linux-2.6.15.orig/kernel/ub/ub_page_bc.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_page_bc.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,418 @@ ++/* ++ * kernel/ub/ub_page_bc.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
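Victim selection above ranks the top-level beancounters by their overdraft: OOM-guaranteed pages already held, plus kernel and socket-buffer bytes converted to pages, minus the oomguarpages barrier; the beancounter with the largest overdraft supplies the process to kill. The same ranking restated as a minimal sketch (field names shortened, PAGE_SHIFT assumed to be 12):

#include <stddef.h>

#define PAGE_SHIFT 12

struct bc {
	struct bc	*next;			/* simple list instead of the hash */
	long		 oomguar_held;		/* pages                           */
	long		 oomguar_barrier;	/* pages                           */
	long		 kmem_bytes;		/* kmemsize held                   */
	long		 sockbuf_bytes;		/* all socket buffers held         */
};

static long overdraft(const struct bc *b)
{
	return b->oomguar_held
		+ ((b->kmem_bytes + b->sockbuf_bytes) >> PAGE_SHIFT)
		- b->oomguar_barrier;
}

/* pick the beancounter exceeding its guarantee by the largest amount */
static struct bc *select_worst(struct bc *list)
{
	struct bc *b, *worst = NULL;
	long max = 0;

	for (b = list; b != NULL; b = b->next)
		if (overdraft(b) > max) {
			max = overdraft(b);
			worst = b;
		}
	return worst;
}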
++ * ++ */ ++ ++#include <linux/spinlock.h> ++#include <linux/slab.h> ++#include <linux/mm.h> ++#include <linux/gfp.h> ++#include <linux/vmalloc.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_hash.h> ++#include <ub/ub_vmpages.h> ++#include <ub/ub_page.h> ++ ++static kmem_cache_t *pb_cachep; ++static spinlock_t pb_lock = SPIN_LOCK_UNLOCKED; ++static struct page_beancounter **pb_hash_table; ++static unsigned int pb_hash_mask; ++ ++/* ++ * Auxiliary staff ++ */ ++ ++static inline struct page_beancounter *next_page_pb(struct page_beancounter *p) ++{ ++ return list_entry(p->page_list.next, struct page_beancounter, ++ page_list); ++} ++ ++static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p) ++{ ++ return list_entry(p->page_list.prev, struct page_beancounter, ++ page_list); ++} ++ ++/* ++ * Held pages manipulation ++ */ ++static inline void set_held_pages(struct user_beancounter *bc) ++{ ++ /* all three depend on ub_held_pages */ ++ __ub_update_physpages(bc); ++ __ub_update_oomguarpages(bc); ++ __ub_update_privvm(bc); ++} ++ ++static inline void do_dec_held_pages(struct user_beancounter *ub, int value) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_held_pages -= value; ++ set_held_pages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++static void dec_held_pages(struct user_beancounter *ub, int value) ++{ ++ for (; ub != NULL; ub = ub->parent) ++ do_dec_held_pages(ub, value); ++} ++ ++static inline void do_inc_held_pages(struct user_beancounter *ub, int value) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_held_pages += value; ++ set_held_pages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++static void inc_held_pages(struct user_beancounter *ub, int value) ++{ ++ for (; ub != NULL; ub = ub->parent) ++ do_inc_held_pages(ub, value); ++} ++ ++/* ++ * Alloc - free ++ */ ++ ++inline int __pb_alloc(struct page_beancounter **pbc, gfp_t mask) ++{ ++ *pbc = kmem_cache_alloc(pb_cachep, mask); ++ if (*pbc != NULL) ++ (*pbc)->pb_magic = PB_MAGIC; ++ return (*pbc == NULL); ++} ++ ++inline void pb_free(struct page_beancounter **pb) ++{ ++ if (*pb != NULL) { ++ kmem_cache_free(pb_cachep, *pb); ++ *pb = NULL; ++ } ++} ++ ++void pb_free_list(struct page_beancounter **p_pb) ++{ ++ struct page_beancounter *list = *p_pb, *pb; ++ while (list) { ++ pb = list; ++ list = list->next_hash; ++ pb_free(&pb); ++ } ++ *p_pb = NULL; ++} ++ ++/* ++ * head -> <new objs> -> <old objs> -> ... ++ */ ++static int __alloc_list(struct page_beancounter **head, int num) ++{ ++ struct page_beancounter *pb; ++ ++ while (num > 0) { ++ if (pb_alloc(&pb)) ++ return -1; ++ pb->next_hash = *head; ++ *head = pb; ++ num--; ++ } ++ ++ return num; ++} ++ ++/* ++ * Ensure that the list contains at least num elements. ++ * p_pb points to an initialized list, may be of the zero length. ++ * ++ * mm->page_table_lock should be held ++ */ ++int pb_alloc_list(struct page_beancounter **p_pb, int num) ++{ ++ struct page_beancounter *list; ++ ++ for (list = *p_pb; list != NULL && num; list = list->next_hash, num--); ++ if (!num) ++ return 0; ++ ++ /* ++ * *p_pb(after) *p_pb (before) ++ * \ \ ++ * <new objs> -...-> <old objs> -> ... 
++ */ ++ if (__alloc_list(p_pb, num) < 0) ++ goto nomem; ++ return 0; ++ ++nomem: ++ pb_free_list(p_pb); ++ return -ENOMEM; ++} ++ ++/* ++ * Hash routines ++ */ ++ ++static inline int pb_hash(struct user_beancounter *ub, struct page *page) ++{ ++ return (((unsigned long)ub << 16) + ((unsigned long)ub >> 16) + ++ (page_to_pfn(page) >> 7)) & pb_hash_mask; ++} ++ ++/* pb_lock should be held */ ++static inline void insert_pb(struct page_beancounter *p, struct page *page, ++ struct user_beancounter *ub, int hash) ++{ ++ p->page = page; ++ p->ub = get_beancounter(ub); ++ p->next_hash = pb_hash_table[hash]; ++ pb_hash_table[hash] = p; ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ ub->ub_stat[smp_processor_id()].pbcs++; ++#endif ++} ++ ++/* ++ * Heart ++ */ ++ ++int pb_alloc_all(struct page_beancounter **pbs) ++{ ++ int i, need_alloc; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ need_alloc = 0; ++ for_each_beancounter(i, ub) ++ need_alloc++; ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ if (!__alloc_list(pbs, need_alloc)) ++ return 0; ++ ++ pb_free_list(pbs); ++ return -ENOMEM; ++} ++ ++int pb_add_ref(struct page *page, struct mm_struct *mm, ++ struct page_beancounter **p_pb) ++{ ++ int hash; ++ struct user_beancounter *bc; ++ struct page_beancounter *p; ++ int shift; ++ struct page_beancounter *head; ++ ++ bc = mm->mm_ub; ++ if (bc == NULL) ++ return 0; ++ ++ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) ++ return 0; ++ ++ hash = pb_hash(bc, page); ++ ++ spin_lock(&pb_lock); ++ for (p = pb_hash_table[hash]; ++ p != NULL && (p->page != page || p->ub != bc); ++ p = p->next_hash); ++ if (p != NULL) { ++ /* ++ * This page is already associated with this beancounter, ++ * increment the usage counter. ++ */ ++ PB_COUNT_INC(p->refcount); ++ spin_unlock(&pb_lock); ++ return 0; ++ } ++ ++ p = *p_pb; ++ if (p == NULL) { ++ spin_unlock(&pb_lock); ++ return -1; ++ } ++ ++ *p_pb = NULL; ++ insert_pb(p, page, bc, hash); ++ head = page_pbc(page); ++ ++ if (head != NULL) { ++ /* ++ * Move the first element to the end of the list. ++ * List head (pb_head) is set to the next entry. ++ * Note that this code works even if head is the only element ++ * on the list (because it's cyclic). ++ */ ++ BUG_ON(head->pb_magic != PB_MAGIC); ++ page_pbc(page) = next_page_pb(head); ++ PB_SHIFT_INC(head->refcount); ++ shift = PB_SHIFT_GET(head->refcount); ++ /* ++ * Update user beancounter, the share of head has been changed. ++ * Note that the shift counter is taken after increment. 
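A page mapped from several beancounters is not charged in full to each of them here: every mapper holds a share of a fixed page weight, and the shift counter halves the head's share each time the circular list grows, so the shares always add back up to exactly one page. A tiny check of that invariant with an illustrative weight value (the real UB_PAGE_WEIGHT constant is defined elsewhere in the patch set):

#include <stdio.h>

#define UB_PAGE_WEIGHT	(1UL << 24)	/* illustrative stand-in value */

int main(void)
{
	/* after three mappers join one page, the shift counters leave one
	 * mapper with half the weight and the other two with a quarter each */
	unsigned long shares = (UB_PAGE_WEIGHT >> 1)
			     + (UB_PAGE_WEIGHT >> 2)
			     + (UB_PAGE_WEIGHT >> 2);

	printf("adds up to one page: %d\n", shares == UB_PAGE_WEIGHT);
	return 0;
}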
++ */ ++ dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift); ++ /* add the new page beancounter to the end of the list */ ++ list_add_tail(&p->page_list, &page_pbc(page)->page_list); ++ } else { ++ page_pbc(page) = p; ++ shift = 0; ++ INIT_LIST_HEAD(&p->page_list); ++ } ++ ++ p->refcount = PB_REFCOUNT_MAKE(shift, 1); ++ spin_unlock(&pb_lock); ++ ++ /* update user beancounter for the new page beancounter */ ++ inc_held_pages(bc, UB_PAGE_WEIGHT >> shift); ++ return 0; ++} ++ ++void pb_remove_ref(struct page *page, struct mm_struct *mm) ++{ ++ int hash; ++ struct user_beancounter *bc; ++ struct page_beancounter *p, **q; ++ int shift, shiftt; ++ ++ bc = mm->mm_ub; ++ if (bc == NULL) ++ return; ++ ++ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) ++ return; ++ ++ hash = pb_hash(bc, page); ++ ++ spin_lock(&pb_lock); ++ BUG_ON(page_pbc(page) != NULL && page_pbc(page)->pb_magic != PB_MAGIC); ++ for (q = pb_hash_table + hash, p = *q; ++ p != NULL && (p->page != page || p->ub != bc); ++ q = &p->next_hash, p = *q); ++ if (p == NULL) ++ goto out_unlock; ++ ++ PB_COUNT_DEC(p->refcount); ++ if (PB_COUNT_GET(p->refcount)) ++ /* ++ * More references from the same user beancounter exist. ++ * Nothing needs to be done. ++ */ ++ goto out_unlock; ++ ++ /* remove from the hash list */ ++ *q = p->next_hash; ++ ++ shift = PB_SHIFT_GET(p->refcount); ++ ++ dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift); ++ ++ if (page_pbc(page) == p) { ++ if (list_empty(&p->page_list)) ++ goto out_free; ++ page_pbc(page) = next_page_pb(p); ++ } ++ list_del(&p->page_list); ++ put_beancounter(p->ub); ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ p->ub->ub_stat[smp_processor_id()].pbcs--; ++#endif ++ pb_free(&p); ++ ++ /* Now balance the list. Move the tail and adjust its shift counter. */ ++ p = prev_page_pb(page_pbc(page)); ++ shiftt = PB_SHIFT_GET(p->refcount); ++ page_pbc(page) = p; ++ PB_SHIFT_DEC(p->refcount); ++ ++ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); ++ ++ /* ++ * If the shift counter of the moved beancounter is different from the ++ * removed one's, repeat the procedure for one more tail beancounter ++ */ ++ if (shiftt > shift) { ++ p = prev_page_pb(page_pbc(page)); ++ page_pbc(page) = p; ++ PB_SHIFT_DEC(p->refcount); ++ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); ++ } ++ spin_unlock(&pb_lock); ++ return; ++ ++out_free: ++ page_pbc(page) = NULL; ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ p->ub->ub_stat[smp_processor_id()].pbcs--; ++#endif ++ put_beancounter(p->ub); ++ pb_free(&p); ++out_unlock: ++ spin_unlock(&pb_lock); ++ return; ++} ++ ++void pb_add_list_ref(struct page *page, struct mm_struct *mm, ++ struct page_beancounter **p_pb) ++{ ++ struct page_beancounter *list, *pb; ++ ++ pb = *p_pb; ++ if (pb == NULL) { ++ /* Typical case due to caller constraints */ ++ if (pb_add_ref(page, mm, &pb)) ++ BUG(); ++ return; ++ } ++ ++ list = pb->next_hash; ++ if (pb_add_ref(page, mm, &pb)) ++ BUG(); ++ if (pb != NULL) { ++ pb->next_hash = list; ++ list = pb; ++ } ++ *p_pb = list; ++} ++ ++struct user_beancounter *pb_grab_page_ub(struct page *page) ++{ ++ struct page_beancounter *pb; ++ struct user_beancounter *ub; ++ ++ spin_lock(&pb_lock); ++ pb = page_pbc(page); ++ ub = (pb == NULL ? 
ERR_PTR(-EINVAL) : ++ get_beancounter(pb->ub)); ++ spin_unlock(&pb_lock); ++ return ub; ++} ++ ++void __init ub_init_pbc(void) ++{ ++ unsigned long hash_size; ++ ++ pb_cachep = kmem_cache_create("page_beancounter", ++ sizeof(struct page_beancounter), 0, ++ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); ++ hash_size = num_physpages >> 2; ++ for (pb_hash_mask = 1; ++ (hash_size & pb_hash_mask) != hash_size; ++ pb_hash_mask = (pb_hash_mask << 1) + 1); ++ hash_size = pb_hash_mask + 1; ++ printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size); ++ pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *)); ++ memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *)); ++} +diff -uprN linux-2.6.15.orig/kernel/ub/ub_pages.c linux-2.6.15-ve025stab014/kernel/ub/ub_pages.c +--- linux-2.6.15.orig/kernel/ub/ub_pages.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_pages.c 2006-01-27 14:48:07.000000000 +0300 +@@ -0,0 +1,530 @@ ++/* ++ * kernel/ub/ub_pages.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/mm.h> ++#include <linux/highmem.h> ++#include <linux/virtinfo.h> ++#include <linux/module.h> ++#include <linux/shmem_fs.h> ++#include <linux/vmalloc.h> ++ ++#include <asm/pgtable.h> ++#include <asm/page.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_vmpages.h> ++ ++void warn_bad_rss(struct vm_area_struct *vma, unsigned long freed) ++{ ++ static struct ub_rate_info ri = { ++ .burst = 10, ++ .interval = 40 * HZ, ++ }; ++ struct user_beancounter *ub; ++ char ubuid[64] = "No UB"; ++ unsigned long vmrss; ++ ++ if (!ub_ratelimit(&ri)) ++ return; ++ ++ ub = vma->vm_mm->mm_ub; ++ if (ub) ++ print_ub_uid(ub, ubuid, sizeof(ubuid)); ++ ++ vmrss = get_vma_rss(vma) + freed; ++ printk(KERN_WARNING ++ "%s vm_rss: process pid %d comm %.20s flags %lx\n" ++ "vma %p/%p rss %lu/%lu freed %lu\n" ++ "flags %lx, ub %s\n", ++ vmrss > freed ? 
"Positive" : "Negative", ++ current->pid, current->comm, current->flags, ++ vma, vma->vm_mm, vmrss, vma_pages(vma), freed, ++ vma->vm_flags, ubuid); ++ dump_stack(); ++} ++ ++static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma, ++ pmd_t *pmd, unsigned long addr, unsigned long end, ++ unsigned long *ret) ++{ ++ pte_t *pte; ++ spinlock_t *ptl; ++ ++ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); ++ do { ++ if (!pte_none(*pte) && pte_present(*pte)) ++ (*ret)++; ++ } while (pte++, addr += PAGE_SIZE, (addr != end)); ++ pte_unmap_unlock(pte - 1, ptl); ++ ++ return addr; ++} ++ ++static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma, ++ pud_t *pud, unsigned long addr, unsigned long end, ++ unsigned long *ret) ++{ ++ pmd_t *pmd; ++ unsigned long next; ++ ++ pmd = pmd_offset(pud, addr); ++ do { ++ next = pmd_addr_end(addr, end); ++ if (pmd_none_or_clear_bad(pmd)) ++ continue; ++ next = pages_in_pte_range(vma, pmd, addr, next, ret); ++ } while (pmd++, addr = next, (addr != end)); ++ ++ return addr; ++} ++ ++static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma, ++ pgd_t *pgd, unsigned long addr, unsigned long end, ++ unsigned long *ret) ++{ ++ pud_t *pud; ++ unsigned long next; ++ ++ pud = pud_offset(pgd, addr); ++ do { ++ next = pud_addr_end(addr, end); ++ if (pud_none_or_clear_bad(pud)) ++ continue; ++ next = pages_in_pmd_range(vma, pud, addr, next, ret); ++ } while (pud++, addr = next, (addr != end)); ++ ++ return addr; ++} ++ ++unsigned long pages_in_vma_range(struct vm_area_struct *vma, ++ unsigned long addr, unsigned long end) ++{ ++ pgd_t *pgd; ++ unsigned long next; ++ unsigned long ret; ++ ++ ret = 0; ++ BUG_ON(addr >= end); ++ pgd = pgd_offset(vma->vm_mm, addr); ++ do { ++ next = pgd_addr_end(addr, end); ++ if (pgd_none_or_clear_bad(pgd)) ++ continue; ++ next = pages_in_pud_range(vma, pgd, addr, next, &ret); ++ } while (pgd++, addr = next, (addr != end)); ++ return ret; ++} ++ ++void fastcall __ub_update_physpages(struct user_beancounter *ub) ++{ ++ ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages ++ + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT); ++ ub_adjust_maxheld(ub, UB_PHYSPAGES); ++} ++ ++void fastcall __ub_update_oomguarpages(struct user_beancounter *ub) ++{ ++ ub->ub_parms[UB_OOMGUARPAGES].held = ++ ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages; ++ ub_adjust_maxheld(ub, UB_OOMGUARPAGES); ++} ++ ++void fastcall __ub_update_privvm(struct user_beancounter *ub) ++{ ++ ub->ub_parms[UB_PRIVVMPAGES].held = ++ (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT) ++ + ub->ub_unused_privvmpages ++ + ub->ub_parms[UB_SHMPAGES].held; ++ ub_adjust_maxheld(ub, UB_PRIVVMPAGES); ++} ++ ++static inline int __charge_privvm_locked(struct user_beancounter *ub, ++ unsigned long s, enum severity strict) ++{ ++ if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0) ++ return -ENOMEM; ++ ++ ub->ub_unused_privvmpages += s; ++ return 0; ++} ++ ++static void __unused_privvm_dec_locked(struct user_beancounter *ub, ++ long size) ++{ ++ /* catch possible overflow */ ++ if (ub->ub_unused_privvmpages < size) { ++ uncharge_warn(ub, UB_UNUSEDPRIVVM, ++ size, ub->ub_unused_privvmpages); ++ size = ub->ub_unused_privvmpages; ++ } ++ ub->ub_unused_privvmpages -= size; ++ __ub_update_privvm(ub); ++} ++ ++void __ub_unused_privvm_dec(struct mm_struct *mm, long size) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ 
spin_lock_irqsave(&ub->ub_lock, flags); ++ __unused_privvm_dec_locked(ub, size); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_unused_privvm_sub(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long count) ++{ ++ if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) ++ __ub_unused_privvm_dec(mm, count); ++} ++ ++void ub_unused_privvm_add(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long size) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) ++ return; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_unused_privvmpages += size; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++int ub_protected_charge(struct mm_struct *mm, unsigned long size, ++ unsigned long newflags, struct vm_area_struct *vma) ++{ ++ unsigned long flags; ++ struct file *file; ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return PRIVVM_NO_CHARGE; ++ ++ flags = vma->vm_flags; ++ if (!((newflags ^ flags) & VM_WRITE)) ++ return PRIVVM_NO_CHARGE; ++ ++ file = vma->vm_file; ++ if (!VM_UB_PRIVATE(newflags | VM_WRITE, file)) ++ return PRIVVM_NO_CHARGE; ++ ++ if (flags & VM_WRITE) ++ return PRIVVM_TO_SHARED; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (__charge_privvm_locked(ub, size, UB_SOFT) < 0) ++ goto err; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return PRIVVM_TO_PRIVATE; ++ ++err: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return PRIVVM_ERROR; ++} ++ ++int ub_memory_charge(struct mm_struct *mm, unsigned long size, ++ unsigned vm_flags, struct file *vm_file, int sv) ++{ ++ struct user_beancounter *ub, *ubl; ++ unsigned long flags; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return 0; ++ ++ size >>= PAGE_SHIFT; ++ if (size > UB_MAXVALUE) ++ return -EINVAL; ++ ++ BUG_ON(sv != UB_SOFT && sv != UB_HARD); ++ ++ if (vm_flags & VM_LOCKED) { ++ if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv)) ++ goto out_err; ++ } ++ if (VM_UB_PRIVATE(vm_flags, vm_file)) { ++ for (ubl = ub; ubl->parent != NULL; ubl = ubl->parent); ++ spin_lock_irqsave(&ubl->ub_lock, flags); ++ if (__charge_privvm_locked(ubl, size, sv)) ++ goto out_private; ++ spin_unlock_irqrestore(&ubl->ub_lock, flags); ++ } ++ return 0; ++ ++out_private: ++ spin_unlock_irqrestore(&ubl->ub_lock, flags); ++ if (vm_flags & VM_LOCKED) ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size); ++out_err: ++ return -ENOMEM; ++} ++ ++void ub_memory_uncharge(struct mm_struct *mm, unsigned long size, ++ unsigned vm_flags, struct file *vm_file) ++{ ++ struct user_beancounter *ub; ++ unsigned long flags; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return; ++ ++ size >>= PAGE_SHIFT; ++ ++ if (vm_flags & VM_LOCKED) ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size); ++ if (VM_UB_PRIVATE(vm_flags, vm_file)) { ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __unused_privvm_dec_locked(ub, size); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ } ++} ++ ++int ub_locked_charge(struct mm_struct *mm, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return 0; ++ ++ return charge_beancounter(ub, UB_LOCKEDPAGES, ++ size >> PAGE_SHIFT, UB_HARD); ++} ++ ++void ub_locked_uncharge(struct mm_struct *mm, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return; ++ 
++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); ++} ++ ++int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return 0; ++ ++ return charge_beancounter(ub, UB_LOCKEDPAGES, ++ size >> PAGE_SHIFT, UB_HARD); ++} ++ ++void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return; ++ ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); ++} ++ ++ ++static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_tmpfs_respages++; ++ __ub_update_physpages(ub); ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_tmpfs_respages_inc(struct shmem_inode_info *shi) ++{ ++ struct user_beancounter *ub; ++ ++ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) ++ do_ub_tmpfs_respages_inc(ub); ++} ++ ++static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ /* catch possible overflow */ ++ if (ub->ub_tmpfs_respages < size) { ++ uncharge_warn(ub, UB_TMPFSPAGES, ++ size, ub->ub_tmpfs_respages); ++ size = ub->ub_tmpfs_respages; ++ } ++ ub->ub_tmpfs_respages -= size; ++ /* update values what is the most interesting */ ++ __ub_update_physpages(ub); ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_tmpfs_respages_sub(struct shmem_inode_info *shi, ++ unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) ++ do_ub_tmpfs_respages_sub(ub, size); ++} ++ ++int ub_shmpages_charge(struct shmem_inode_info *shi, long size) ++{ ++ int ret; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL || size <= 0) ++ return 0; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD); ++ if (ret == 0) ++ __ub_update_privvm(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return ret; ++} ++ ++void ub_shmpages_uncharge(struct shmem_inode_info *shi, long size) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_beancounter_locked(ub, UB_SHMPAGES, size); ++ __ub_update_privvm(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++#ifdef CONFIG_USER_SWAP_ACCOUNTING ++static inline void do_ub_swapentry_inc(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_swap_pages++; ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num, ++ struct user_beancounter *ub) ++{ ++ si->swap_ubs[num] = get_beancounter(ub); ++ for (; ub != NULL; ub = ub->parent) ++ do_ub_swapentry_inc(ub); ++} ++EXPORT_SYMBOL(ub_swapentry_inc); ++ ++static inline void do_ub_swapentry_dec(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (ub->ub_swap_pages <= 0) ++ uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages); ++ else ++ 
ub->ub_swap_pages--; ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num) ++{ ++ struct user_beancounter *ub, *ubp; ++ ++ ub = si->swap_ubs[num]; ++ si->swap_ubs[num] = NULL; ++ for (ubp = ub; ubp != NULL; ubp = ubp->parent) ++ do_ub_swapentry_dec(ubp); ++ put_beancounter(ub); ++} ++EXPORT_SYMBOL(ub_swapentry_dec); ++ ++int ub_swap_init(struct swap_info_struct *si, pgoff_t num) ++{ ++ struct user_beancounter **ubs; ++ ++ ubs = vmalloc(num * sizeof(struct user_beancounter *)); ++ if (ubs == NULL) ++ return -ENOMEM; ++ ++ memset(ubs, 0, num * sizeof(struct user_beancounter *)); ++ si->swap_ubs = ubs; ++ return 0; ++} ++ ++void ub_swap_fini(struct swap_info_struct *si) ++{ ++ if (si->swap_ubs) { ++ vfree(si->swap_ubs); ++ si->swap_ubs = NULL; ++ } ++} ++#endif ++ ++static int vmguar_enough_memory(struct vnotifier_block *self, ++ unsigned long event, void *arg, int old_ret) ++{ ++ struct user_beancounter *ub; ++ ++ if (event != VIRTINFO_ENOUGHMEM) ++ return old_ret; ++ ++ for (ub = current->mm->mm_ub; ub->parent != NULL; ub = ub->parent); ++ if (ub->ub_parms[UB_PRIVVMPAGES].held > ++ ub->ub_parms[UB_VMGUARPAGES].barrier) ++ return old_ret; ++ ++ return NOTIFY_OK; ++} ++ ++static struct vnotifier_block vmguar_notifier_block = { ++ .notifier_call = vmguar_enough_memory ++}; ++ ++static int __init init_vmguar_notifier(void) ++{ ++ virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block); ++ return 0; ++} ++ ++static void __exit fini_vmguar_notifier(void) ++{ ++ virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block); ++} ++ ++module_init(init_vmguar_notifier); ++module_exit(fini_vmguar_notifier); +diff -uprN linux-2.6.15.orig/kernel/ub/ub_proc.c linux-2.6.15-ve025stab014/kernel/ub/ub_proc.c +--- linux-2.6.15.orig/kernel/ub/ub_proc.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_proc.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,389 @@ ++/* ++ * linux/fs/proc/proc_ub.c ++ * ++ * Copyright (C) 1998-2000 Andrey V. Savochkin <saw@saw.sw.com.sg> ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * TODO: ++ * ++ * Changes: ++ */ ++ ++#include <linux/errno.h> ++#include <linux/sched.h> ++#include <linux/kernel.h> ++#include <linux/mm.h> ++#include <linux/proc_fs.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_hash.h> ++#include <ub/ub_debug.h> ++#include <ub/ub_page.h> ++ ++#include <asm/page.h> ++#include <asm/uaccess.h> ++ ++/* ++ * we have 8 format strings depending on: ++ * 1. BITS_PER_LONG ++ * 2. CONFIG_UBC_KEEP_UNUSED ++ * 3. 
resource number (see out_proc_beancounter) ++ */ ++ ++#ifdef CONFIG_UBC_KEEP_UNUSED ++#define REF_FORMAT "%5.5s %4i: %-12s " ++#define UID_HEAD_STR "uid ref" ++#else ++#define REF_FORMAT "%10.10s: %-12s " ++#define UID_HEAD_STR "uid" ++#endif ++#define REF2_FORMAT "%10s %-12s " ++ ++#if BITS_PER_LONG == 32 ++#define RES_FORMAT "%10lu %10lu %10lu %10lu %10lu" ++#define HEAD_FORMAT "%10s %10s %10s %10s %10s" ++#define UB_PROC_LINE_TEXT (10+2+12+1+10+1+10+1+10+1+10+1+10) ++#else ++#define RES_FORMAT "%20lu %20lu %20lu %20lu %20lu" ++#define HEAD_FORMAT "%20s %20s %20s %20s %20s" ++#define UB_PROC_LINE_TEXT (10+2+12+1+20+1+20+1+20+1+20+1+20) ++#endif ++ ++#define UB_PROC_LINE_LEN (UB_PROC_LINE_TEXT + 1) ++ ++static void out_proc_version(char *buf) ++{ ++ int len; ++ ++ len = sprintf(buf, "Version: 2.5"); ++ memset(buf + len, ' ', UB_PROC_LINE_TEXT - len); ++ buf[UB_PROC_LINE_TEXT] = '\n'; ++} ++ ++static void out_proc_head(char *buf) ++{ ++ sprintf(buf, REF2_FORMAT HEAD_FORMAT, ++ UID_HEAD_STR, "resource", "held", "maxheld", ++ "barrier", "limit", "failcnt"); ++ buf[UB_PROC_LINE_TEXT] = '\n'; ++} ++ ++static void out_proc_beancounter(char *buf, struct user_beancounter *ub, int r) ++{ ++ if (r == 0) { ++ char tmpbuf[64]; ++ print_ub_uid(ub, tmpbuf, sizeof(tmpbuf)); ++ sprintf(buf, REF_FORMAT RES_FORMAT, ++ tmpbuf, ++#ifdef CONFIG_UBC_KEEP_UNUSED ++ atomic_read(&ub->ub_refcount), ++#endif ++ ub_rnames[r], ub->ub_parms[r].held, ++ ub->ub_parms[r].maxheld, ub->ub_parms[r].barrier, ++ ub->ub_parms[r].limit, ub->ub_parms[r].failcnt); ++ } else ++ sprintf(buf, REF2_FORMAT RES_FORMAT, ++ "", ub_rnames[r], ++ ub->ub_parms[r].held, ub->ub_parms[r].maxheld, ++ ub->ub_parms[r].barrier, ub->ub_parms[r].limit, ++ ub->ub_parms[r].failcnt); ++ ++ buf[UB_PROC_LINE_TEXT] = '\n'; ++} ++ ++static int ub_accessible(struct user_beancounter *ub, ++ struct user_beancounter *exec_ub, ++ struct file *file) ++{ ++ struct user_beancounter *p, *q; ++ ++ for (p = exec_ub; p->parent != NULL; p = p->parent); ++ for (q = ub; q->parent != NULL; q = q->parent); ++ if (p != get_ub0() && q != p) ++ return 0; ++ if (ub->parent == NULL) ++ return 1; ++ return file->private_data == NULL ? 
0 : 1; ++} ++ ++static ssize_t ub_proc_read(struct file *file, char *usrbuf, size_t len, ++ loff_t *poff) ++{ ++ ssize_t retval; ++ char *buf; ++ unsigned long flags; ++ int i, resource; ++ struct ub_hash_slot *slot; ++ struct user_beancounter *ub; ++ struct user_beancounter *exec_ub = get_exec_ub(); ++ loff_t n, off; ++ int rem, produced, job, tocopy; ++ const int is_capable = ++ (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)); ++ ++ retval = -ENOBUFS; ++ buf = (char *)__get_free_page(GFP_KERNEL); ++ if (buf == NULL) ++ goto out; ++ ++ retval = 0; ++ if (!is_capable) ++ goto out_free; ++ ++ off = *poff; ++ if (off < 0) /* can't happen, just in case */ ++ goto inval; ++ ++again: ++ i = 0; ++ slot = ub_hash; ++ n = off; /* The amount of data tp skip */ ++ produced = 0; ++ if (n < (UB_PROC_LINE_LEN * 2)) { ++ if (n < UB_PROC_LINE_LEN) { ++ out_proc_version(buf); ++ produced += UB_PROC_LINE_LEN; ++ n += UB_PROC_LINE_LEN; ++ } ++ out_proc_head(buf + produced); ++ produced += UB_PROC_LINE_LEN; ++ n += UB_PROC_LINE_LEN; ++ } ++ n -= (2 * UB_PROC_LINE_LEN); ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ while (1) { ++ for (ub = slot->ubh_beans; ++ ub != NULL && n >= (UB_RESOURCES * UB_PROC_LINE_LEN); ++ ub = ub->ub_next) ++ if (is_capable && ub_accessible(ub, exec_ub, file)) ++ n -= (UB_RESOURCES * UB_PROC_LINE_LEN); ++ if (ub != NULL || ++i >= UB_HASH_SIZE) ++ break; ++ ++slot; ++ } ++ rem = n; /* the amount of the data in the buffer to skip */ ++ job = PAGE_SIZE - UB_PROC_LINE_LEN + 1; /* end of buffer data */ ++ if (len < job - rem) ++ job = rem + len; ++ while (ub != NULL && produced < job) { ++ if (is_capable && ub_accessible(ub, exec_ub, file)) ++ for (resource = 0; ++ produced < job && resource < UB_RESOURCES; ++ resource++, produced += UB_PROC_LINE_LEN) ++ { ++ out_proc_beancounter(buf + produced, ++ ub, resource); ++ } ++ if (produced >= job) ++ break; ++ /* Find the next beancounter to produce more data. */ ++ ub = ub->ub_next; ++ while (ub == NULL && ++i < UB_HASH_SIZE) { ++ ++slot; ++ ub = slot->ubh_beans; ++ } ++ } ++ ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ub_debug(UBD_ALLOC, KERN_DEBUG "UB_PROC: produced %d, job %d, rem %d\n", ++ produced, job, rem); ++ ++ /* ++ * Temporary buffer `buf' contains `produced' bytes. ++ * Extract no more than `len' bytes at offset `rem'. ++ */ ++ if (produced <= rem) ++ goto out_free; ++ tocopy = produced - rem; ++ if (len < tocopy) ++ tocopy = len; ++ if (!tocopy) ++ goto out_free; ++ if (copy_to_user(usrbuf, buf + rem, tocopy)) ++ goto fault; ++ off += tocopy; /* can't overflow */ ++ *poff = off; ++ len -= tocopy; ++ retval += tocopy; ++ if (!len) ++ goto out_free; ++ usrbuf += tocopy; ++ goto again; ++ ++fault: ++ retval = -EFAULT; ++out_free: ++ free_page((unsigned long)buf); ++out: ++ return retval; ++ ++inval: ++ retval = -EINVAL; ++ goto out_free; ++} ++ ++static int ub_proc_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = strcmp(file->f_dentry->d_name.name, ++ "user_beancounters") ? 
++ (void *)-1 : NULL; ++ return 0; ++} ++ ++static struct file_operations ub_file_operations = { ++ .read = &ub_proc_read, ++ .open = &ub_proc_open ++}; ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++#include <linux/seq_file.h> ++#include <linux/kmem_cache.h> ++ ++static void *ubd_start(struct seq_file *m, loff_t *pos) ++{ ++ loff_t n = *pos; ++ struct user_beancounter *ub; ++ long slot; ++ ++ spin_lock_irq(&ub_hash_lock); ++ for (slot = 0; slot < UB_HASH_SIZE; slot++) ++ for (ub = ub_hash[slot].ubh_beans; ub; ub = ub->ub_next) { ++ if (n == 0) { ++ m->private = (void *)slot; ++ return (void *)ub; ++ } ++ n--; ++ } ++ return NULL; ++} ++ ++static void *ubd_next(struct seq_file *m, void *p, loff_t *pos) ++{ ++ struct user_beancounter *ub; ++ long slot; ++ ++ ub = (struct user_beancounter *)p; ++ slot = (long)m->private; ++ ++ ++*pos; ++ ub = ub->ub_next; ++ while (1) { ++ for (; ub; ub = ub->ub_next) { ++ m->private = (void *)slot; ++ return (void *)ub; ++ } ++ slot++; ++ if (slot == UB_HASH_SIZE) ++ break; ++ ub = ub_hash[slot].ubh_beans; ++ } ++ return NULL; ++} ++ ++static void ubd_stop(struct seq_file *m, void *p) ++{ ++ spin_unlock_irq(&ub_hash_lock); ++} ++ ++#define PROC_LINE_FMT "\t%-17s\t%5lu\t%5lu\n" ++ ++static int ubd_show(struct seq_file *m, void *p) ++{ ++ struct user_beancounter *ub; ++ struct ub_cache_counter *cc; ++ long pages, vmpages, pbc, swap, unmap; ++ int i; ++ char id[64]; ++ ++ ub = (struct user_beancounter *)p; ++ print_ub_uid(ub, id, sizeof(id)); ++ seq_printf(m, "%s:%d\n", id, atomic_read(&ub->ub_refcount)); ++ ++ pages = vmpages = pbc = swap = unmap = 0; ++ for (i = 0; i < NR_CPUS; i++) { ++ pages += ub->ub_stat[i].pages_charged; ++ vmpages += ub->ub_stat[i].vmalloc_charged; ++ pbc += ub->ub_stat[i].pbcs; ++ swap += ub->ub_stat[i].swapin; ++ unmap += ub->ub_stat[i].unmap; ++ } ++ if (pages < 0) ++ pages = 0; ++ if (vmpages < 0) ++ vmpages = 0; ++ seq_printf(m, PROC_LINE_FMT, "pages", pages, PAGE_SIZE); ++ seq_printf(m, PROC_LINE_FMT, "vmalloced", vmpages, PAGE_SIZE); ++ ++ seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_UNUSEDPRIVVM], ++ ub->ub_unused_privvmpages, PAGE_SIZE); ++ seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_TMPFSPAGES], ++ ub->ub_tmpfs_respages, PAGE_SIZE); ++ seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_SWAPPAGES], ++ ub->ub_swap_pages, PAGE_SIZE); ++ seq_printf(m, PROC_LINE_FMT, "pbcs", pbc, ++ (unsigned long)sizeof(struct page_beancounter)); ++ ++ seq_printf(m, PROC_LINE_FMT, "swapin", swap, 0UL); ++ seq_printf(m, PROC_LINE_FMT, "unmap", unmap, 0UL); ++ /* interrupts are disabled by locking ub_hash_lock */ ++ spin_lock(&cc_lock); ++ list_for_each_entry (cc, &ub->ub_cclist, ulist) { ++ kmem_cache_t *cachep; ++ ++ cachep = cc->cachep; ++ seq_printf(m, PROC_LINE_FMT, ++ cachep->name, ++ cc->counter, ++ (unsigned long)cachep->objuse); ++ } ++ spin_unlock(&cc_lock); ++ return 0; ++} ++ ++static struct seq_operations kmemdebug_op = { ++ .start = ubd_start, ++ .next = ubd_next, ++ .stop = ubd_stop, ++ .show = ubd_show, ++}; ++ ++static int kmem_debug_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &kmemdebug_op); ++} ++ ++static struct file_operations kmem_debug_ops = { ++ .open = kmem_debug_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++#endif ++ ++void __init ub_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ entry = create_proc_entry("user_beancounters", S_IRUGO, NULL); ++ if (entry) ++ entry->proc_fops = &ub_file_operations; ++ else ++ panic("Can't create /proc/user_beancounters entry!\n"); ++ 
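/*
 * Illustrative note, not part of the patch: with the REF/RES format
 * strings defined at the top of this file, a read of the entry created
 * above returns a "Version: 2.5" line, the column header emitted by
 * out_proc_head(), and then one row per resource for each accessible
 * beancounter, roughly like this (uid, resource name and numbers are
 * invented; the real names come from ub_rnames[]):
 *
 *	Version: 2.5
 *	       uid  resource       held  maxheld  barrier    limit  failcnt
 *	       101: lockedpages       0        0     2048     2048        0
 *
 * The "user_beancounters_sub" entry registered below shares the same
 * file operations; ub_proc_open() leaves private_data NULL only for
 * "user_beancounters", and ub_accessible() uses that to decide whether
 * sub-beancounters (those with ub->parent != NULL) appear in the listing.
 */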
++ entry = create_proc_entry("user_beancounters_sub", S_IRUGO, NULL); ++ if (entry) ++ entry->proc_fops = &ub_file_operations; ++ else ++ panic("Can't create /proc/user_beancounters2 entry!\n"); ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ entry = create_proc_entry("user_beancounters_debug", S_IRUGO, NULL); ++ if (entry) ++ entry->proc_fops = &kmem_debug_ops; ++ else ++ panic("Can't create /proc/user_beancounters_debug entry!\n"); ++#endif ++} +diff -uprN linux-2.6.15.orig/kernel/ub/ub_stat.c linux-2.6.15-ve025stab014/kernel/ub/ub_stat.c +--- linux-2.6.15.orig/kernel/ub/ub_stat.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_stat.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,465 @@ ++/* ++ * kernel/ub/ub_stat.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/timer.h> ++#include <linux/sched.h> ++#include <linux/init.h> ++#include <linux/jiffies.h> ++#include <linux/list.h> ++#include <linux/errno.h> ++#include <linux/suspend.h> ++ ++#include <asm/uaccess.h> ++#include <asm/param.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_hash.h> ++#include <ub/ub_stat.h> ++ ++static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED; ++static LIST_HEAD(ubs_notify_list); ++static long ubs_min_interval; ++static ubstattime_t ubs_start_time, ubs_end_time; ++static struct timer_list ubs_timer; ++ ++static int ubstat_get_list(void *buf, long size) ++{ ++ int retval; ++ unsigned long flags; ++ int slotnr; ++ struct ub_hash_slot *slot; ++ struct user_beancounter *ub, *last_ub; ++ long *page, *ptr, *end; ++ int len; ++ ++ page = (long *)__get_free_page(GFP_KERNEL); ++ if (page == NULL) ++ return -ENOMEM; ++ ++ retval = 0; ++ slotnr = 0; ++ slot = ub_hash; ++ last_ub = NULL; ++ while (1) { ++ ptr = page; ++ end = page + PAGE_SIZE / sizeof(*ptr); ++ ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ if (last_ub == NULL) ++ ub = slot->ubh_beans; ++ else ++ ub = last_ub->ub_next; ++ while (1) { ++ for (; ub != NULL; ub = ub->ub_next) { ++ if (ub->parent != NULL) ++ continue; ++ *ptr++ = ub->ub_uid; ++ if (ptr == end) ++ break; ++ } ++ if (ptr == end) ++ break; ++ ++slot; ++ if (++slotnr >= UB_HASH_SIZE) ++ break; ++ ub = slot->ubh_beans; ++ } ++ if (ptr == page) ++ goto out_unlock; ++ if (ub != NULL) ++ get_beancounter(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ if (last_ub != NULL) ++ put_beancounter(last_ub); ++ last_ub = ub; /* last visited beancounter in the slot */ ++ ++ len = min_t(long, (ptr - page) * sizeof(*ptr), size); ++ if (copy_to_user(buf, page, len)) { ++ retval = -EFAULT; ++ break; ++ } ++ retval += len; ++ if (len < PAGE_SIZE) ++ break; ++ buf += len; ++ size -= len; ++ } ++out: ++ if (last_ub != NULL) ++ put_beancounter(last_ub); ++ free_page((unsigned long)page); ++ return retval; ++ ++out_unlock: ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ goto out; ++} ++ ++static int ubstat_gettime(void *buf, long size) ++{ ++ ubgettime_t data; ++ int retval; ++ ++ spin_lock(&ubs_notify_lock); ++ data.start_time = ubs_start_time; ++ data.end_time = ubs_end_time; ++ data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ; ++ spin_unlock(&ubs_notify_lock); ++ ++ retval = min_t(long, sizeof(data), size); ++ if (copy_to_user(buf, &data, retval)) ++ retval = -EFAULT; ++ return retval; ++} ++ ++static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf) ++{ ++ struct { ++ ubstattime_t start_time; 
++ ubstattime_t end_time; ++ ubstatparm_t param[1]; ++ } *data; ++ ++ data = kbuf; ++ data->start_time = ubs_start_time; ++ data->end_time = ubs_end_time; ++ ++ data->param[0].maxheld = ub->ub_store[res].maxheld; ++ data->param[0].failcnt = ub->ub_store[res].failcnt; ++ ++ return sizeof(*data); ++} ++ ++static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size) ++{ ++ int wrote; ++ struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstatparm_t param[UB_RESOURCES]; ++ } *data; ++ int resource; ++ ++ data = kbuf; ++ data->start_time = ubs_start_time; ++ data->end_time = ubs_end_time; ++ wrote = sizeof(data->start_time) + sizeof(data->end_time); ++ ++ for (resource = 0; resource < UB_RESOURCES; resource++) { ++ if (size < wrote + sizeof(data->param[resource])) ++ break; ++ data->param[resource].maxheld = ub->ub_store[resource].maxheld; ++ data->param[resource].failcnt = ub->ub_store[resource].failcnt; ++ wrote += sizeof(data->param[resource]); ++ } ++ ++ return wrote; ++} ++ ++static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf, ++ int size) ++{ ++ int wrote; ++ struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstatparmf_t param[UB_RESOURCES]; ++ } *data; ++ int resource; ++ ++ data = kbuf; ++ data->start_time = ubs_start_time; ++ data->end_time = ubs_end_time; ++ wrote = sizeof(data->start_time) + sizeof(data->end_time); ++ ++ for (resource = 0; resource < UB_RESOURCES; resource++) { ++ if (size < wrote + sizeof(data->param[resource])) ++ break; ++ /* The beginning of ubstatparmf_t matches struct ubparm. */ ++ memcpy(&data->param[resource], &ub->ub_store[resource], ++ sizeof(ub->ub_store[resource])); ++ data->param[resource].__unused1 = 0; ++ data->param[resource].__unused2 = 0; ++ wrote += sizeof(data->param[resource]); ++ } ++ return wrote; ++} ++ ++static int ubstat_get_stat(struct user_beancounter *ub, long cmd, ++ void *buf, long size) ++{ ++ void *kbuf; ++ int retval; ++ ++ kbuf = (void *)__get_free_page(GFP_KERNEL); ++ if (kbuf == NULL) ++ return -ENOMEM; ++ ++ spin_lock(&ubs_notify_lock); ++ switch (UBSTAT_CMD(cmd)) { ++ case UBSTAT_READ_ONE: ++ retval = -EINVAL; ++ if (UBSTAT_PARMID(cmd) >= UB_RESOURCES) ++ break; ++ retval = ubstat_do_read_one(ub, ++ UBSTAT_PARMID(cmd), kbuf); ++ break; ++ case UBSTAT_READ_ALL: ++ retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE); ++ break; ++ case UBSTAT_READ_FULL: ++ retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE); ++ break; ++ default: ++ retval = -EINVAL; ++ } ++ spin_unlock(&ubs_notify_lock); ++ ++ if (retval > 0) { ++ retval = min_t(long, retval, size); ++ if (copy_to_user(buf, kbuf, retval)) ++ retval = -EFAULT; ++ } ++ ++ free_page((unsigned long)kbuf); ++ return retval; ++} ++ ++static int ubstat_handle_notifrq(ubnotifrq_t *req) ++{ ++ int retval; ++ struct ub_stat_notify *new_notify; ++ struct list_head *entry; ++ struct task_struct *tsk_to_free; ++ ++ new_notify = kmalloc(sizeof(new_notify), GFP_KERNEL); ++ if (new_notify == NULL) ++ return -ENOMEM; ++ ++ tsk_to_free = NULL; ++ INIT_LIST_HEAD(&new_notify->list); ++ ++ spin_lock(&ubs_notify_lock); ++ list_for_each(entry, &ubs_notify_list) { ++ struct ub_stat_notify *notify; ++ ++ notify = list_entry(entry, struct ub_stat_notify, list); ++ if (notify->task == current) { ++ kfree(new_notify); ++ new_notify = notify; ++ break; ++ } ++ } ++ ++ retval = -EINVAL; ++ if (req->maxinterval < 1) ++ goto out_unlock; ++ if (req->maxinterval > TIME_MAX_SEC) ++ req->maxinterval = TIME_MAX_SEC; ++ if (req->maxinterval < 
ubs_min_interval) { ++ unsigned long dif; ++ ++ ubs_min_interval = req->maxinterval; ++ dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ; ++ if (dif > req->maxinterval) ++ mod_timer(&ubs_timer, ++ ubs_timer.expires - ++ (dif - req->maxinterval) * HZ); ++ } ++ ++ if (entry != &ubs_notify_list) { ++ list_del(&new_notify->list); ++ tsk_to_free = new_notify->task; ++ } ++ if (req->signum) { ++ new_notify->task = current; ++ get_task_struct(new_notify->task); ++ new_notify->signum = req->signum; ++ list_add(&new_notify->list, &ubs_notify_list); ++ } else ++ kfree(new_notify); ++ retval = 0; ++out_unlock: ++ spin_unlock(&ubs_notify_lock); ++ if (tsk_to_free != NULL) ++ put_task_struct(tsk_to_free); ++ return retval; ++} ++ ++/* ++ * former sys_ubstat ++ */ ++long do_ubstat(int func, unsigned long arg1, unsigned long arg2, void *buf, ++ long size) ++{ ++ int retval; ++ struct user_beancounter *ub; ++ ++ if (func == UBSTAT_UBPARMNUM) ++ return UB_RESOURCES; ++ if (func == UBSTAT_UBLIST) ++ return ubstat_get_list(buf, size); ++ if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))) ++ return -EPERM; ++ ++ if (func == UBSTAT_GETTIME) { ++ retval = ubstat_gettime(buf, size); ++ goto notify; ++ } ++ ++ ub = get_exec_ub(); ++ if (ub != NULL && ub->ub_uid == arg1) ++ get_beancounter(ub); ++ else /* FIXME must be if (ve_is_super) */ ++ ub = get_beancounter_byuid(arg1, 0); ++ ++ if (ub == NULL) ++ return -ESRCH; ++ ++ retval = ubstat_get_stat(ub, func, buf, size); ++ put_beancounter(ub); ++notify: ++ /* Handle request for notification */ ++ if (retval >= 0) { ++ ubnotifrq_t notifrq; ++ int err; ++ ++ err = -EFAULT; ++ if (!copy_from_user(¬ifrq, (void *)arg2, sizeof(notifrq))) ++ err = ubstat_handle_notifrq(¬ifrq); ++ if (err) ++ retval = err; ++ } ++ ++ return retval; ++} ++ ++static void ubstat_save_onestat(struct user_beancounter *ub) ++{ ++ int resource; ++ ++ /* called with local irq disabled */ ++ spin_lock(&ub->ub_lock); ++ for (resource = 0; resource < UB_RESOURCES; resource++) { ++ memcpy(&ub->ub_store[resource], &ub->ub_parms[resource], ++ sizeof(struct ubparm)); ++ ub->ub_parms[resource].minheld = ++ ub->ub_parms[resource].maxheld = ++ ub->ub_parms[resource].held; ++ } ++ spin_unlock(&ub->ub_lock); ++} ++ ++static void ubstat_save_statistics(void) ++{ ++ unsigned long flags; ++ int i; ++ struct user_beancounter *ub; ++ ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ for_each_beancounter(i, ub) ++ ubstat_save_onestat(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++} ++ ++static void ubstatd_timeout(unsigned long __data) ++{ ++ struct task_struct *p; ++ ++ p = (struct task_struct *) __data; ++ wake_up_process(p); ++} ++ ++/* ++ * Safe wrapper for send_sig. It prevents a race with release_task ++ * for sighand. ++ * Should be called under tasklist_lock. 
++ */ ++static void task_send_sig(struct ub_stat_notify *notify) ++{ ++ if (likely(notify->task->sighand != NULL)) ++ send_sig(notify->signum, notify->task, 1); ++} ++ ++static inline void do_notifies(void) ++{ ++ LIST_HEAD(notif_free_list); ++ struct ub_stat_notify *notify; ++ struct ub_stat_notify *tmp; ++ ++ spin_lock(&ubs_notify_lock); ++ ubs_start_time = ubs_end_time; ++ /* ++ * the expression below relies on time being unsigned long and ++ * arithmetic promotion rules ++ */ ++ ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ; ++ mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ); ++ ubs_min_interval = TIME_MAX_SEC; ++ /* save statistics accumulated for the interval */ ++ ubstat_save_statistics(); ++ /* send signals */ ++ read_lock(&tasklist_lock); ++ while (!list_empty(&ubs_notify_list)) { ++ notify = list_entry(ubs_notify_list.next, ++ struct ub_stat_notify, list); ++ task_send_sig(notify); ++ list_del(¬ify->list); ++ list_add(¬ify->list, ¬if_free_list); ++ } ++ read_unlock(&tasklist_lock); ++ spin_unlock(&ubs_notify_lock); ++ ++ list_for_each_entry_safe(notify, tmp, ¬if_free_list, list) { ++ put_task_struct(notify->task); ++ kfree(notify); ++ } ++} ++ ++/* ++ * Kernel thread ++ */ ++static int ubstatd(void *unused) ++{ ++ /* daemonize call will take care of signals */ ++ daemonize("ubstatd"); ++ ++ ubs_timer.data = (unsigned long)current; ++ ubs_timer.function = ubstatd_timeout; ++ add_timer(&ubs_timer); ++ ++ while (1) { ++ set_task_state(current, TASK_INTERRUPTIBLE); ++ if (time_after(ubs_timer.expires, jiffies)) { ++ schedule(); ++ try_to_freeze(); ++ continue; ++ } ++ ++ __set_task_state(current, TASK_RUNNING); ++ do_notifies(); ++ } ++ return 0; ++} ++ ++static int __init ubstatd_init(void) ++{ ++ init_timer(&ubs_timer); ++ ubs_timer.expires = TIME_MAX_JIF; ++ ubs_min_interval = TIME_MAX_SEC; ++ ubs_start_time = ubs_end_time = 0; ++ ++ kernel_thread(ubstatd, NULL, 0); ++ return 0; ++} ++ ++module_init(ubstatd_init); +diff -uprN linux-2.6.15.orig/kernel/ub/ub_sys.c linux-2.6.15-ve025stab014/kernel/ub/ub_sys.c +--- linux-2.6.15.orig/kernel/ub/ub_sys.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ub/ub_sys.c 2006-01-27 14:48:06.000000000 +0300 +@@ -0,0 +1,148 @@ ++/* ++ * kernel/ub/ub_sys.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <asm/uaccess.h> ++ ++#include <ub/beancounter.h> ++ ++#ifndef CONFIG_USER_RESOURCE ++asmlinkage long sys_getluid(void) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage long sys_setluid(uid_t uid) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage long sys_setublimit(uid_t uid, unsigned long resource, ++ unsigned long *limits) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2, ++ void *buf, long size) ++{ ++ return -ENOSYS; ++} ++#else /* CONFIG_USER_RESOURCE */ ++ ++/* ++ * The (rather boring) getluid syscall ++ */ ++asmlinkage long sys_getluid(void) ++{ ++ struct user_beancounter *ub; ++ ++ ub = get_exec_ub(); ++ if (ub == NULL) ++ return -EINVAL; ++ ++ return ub->ub_uid; ++} ++ ++/* ++ * The setluid syscall ++ */ ++asmlinkage long sys_setluid(uid_t uid) ++{ ++ struct user_beancounter *ub; ++ struct task_beancounter *task_bc; ++ int error; ++ ++ task_bc = ¤t->task_bc; ++ ++ /* You may not disown a setluid */ ++ error = -EINVAL; ++ if (uid == (uid_t)-1) ++ goto out; ++ ++ /* You may only set an ub as root */ ++ error = -EPERM; ++ if (!capable(CAP_SETUID)) ++ goto out; ++ ++ /* Ok - set up a beancounter entry for this user */ ++ error = -ENOBUFS; ++ ub = get_beancounter_byuid(uid, 1); ++ if (ub == NULL) ++ goto out; ++ ++ ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) " ++ "for %.20s pid %d\n", ++ ub, atomic_read(&ub->ub_refcount), ++ current->comm, current->pid); ++ /* install bc */ ++ put_beancounter(task_bc->exec_ub); ++ task_bc->exec_ub = ub; ++ put_beancounter(task_bc->fork_sub); ++ task_bc->fork_sub = get_beancounter(ub); ++ error = 0; ++out: ++ return error; ++} ++ ++/* ++ * The setbeanlimit syscall ++ */ ++asmlinkage long sys_setublimit(uid_t uid, unsigned long resource, ++ unsigned long *limits) ++{ ++ int error; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ unsigned long new_limits[2]; ++ ++ error = -EPERM; ++ if(!capable(CAP_SYS_RESOURCE)) ++ goto out; ++ ++ error = -EINVAL; ++ if (resource >= UB_RESOURCES) ++ goto out; ++ ++ error = -EFAULT; ++ if (copy_from_user(&new_limits, limits, sizeof(new_limits))) ++ goto out; ++ ++ error = -EINVAL; ++ if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE) ++ goto out; ++ ++ error = -ENOENT; ++ ub = get_beancounter_byuid(uid, 0); ++ if (ub == NULL) { ++ ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid); ++ goto out; ++ } ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_parms[resource].barrier = new_limits[0]; ++ ub->ub_parms[resource].limit = new_limits[1]; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ put_beancounter(ub); ++ ++ error = 0; ++out: ++ return error; ++} ++ ++extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2, ++ void *buf, long size); ++asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2, ++ void *buf, long size) ++{ ++ return do_ubstat(func, arg1, arg2, buf, size); ++} ++#endif +diff -uprN linux-2.6.15.orig/kernel/user.c linux-2.6.15-ve025stab014/kernel/user.c +--- linux-2.6.15.orig/kernel/user.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/user.c 2006-01-27 14:48:08.000000000 +0300 +@@ -23,7 +23,20 @@ + #define UIDHASH_SZ (1 << UIDHASH_BITS) + #define UIDHASH_MASK (UIDHASH_SZ - 1) + #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) +-#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) ++#define __uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) ++ ++#ifdef 
CONFIG_VE ++#define UIDHASH_MASK_VE (UIDHASH_SZ_VE - 1) ++#define __uidhashfn_ve(uid) (((uid >> UIDHASH_BITS_VE) ^ uid) & \ ++ UIDHASH_MASK_VE) ++#define __uidhashentry_ve(uid, envid) ((envid)->uidhash_table + \ ++ __uidhashfn_ve(uid)) ++#define uidhashentry_ve(uid) (ve_is_super(get_exec_env()) ? \ ++ __uidhashentry(uid) : \ ++ __uidhashentry_ve(uid, get_exec_env())) ++#else ++#define uidhashentry_ve(uid) __uidhashentry(uid) ++#endif + + static kmem_cache_t *uid_cachep; + static struct list_head uidhash_table[UIDHASH_SZ]; +@@ -84,7 +97,7 @@ struct user_struct *find_user(uid_t uid) + struct user_struct *ret; + + spin_lock(&uidhash_lock); +- ret = uid_hash_find(uid, uidhashentry(uid)); ++ ret = uid_hash_find(uid, uidhashentry_ve(uid)); + spin_unlock(&uidhash_lock); + return ret; + } +@@ -102,7 +115,7 @@ void free_uid(struct user_struct *up) + + struct user_struct * alloc_uid(uid_t uid) + { +- struct list_head *hashent = uidhashentry(uid); ++ struct list_head *hashent = uidhashentry_ve(uid); + struct user_struct *up; + + spin_lock(&uidhash_lock); +@@ -177,14 +190,14 @@ static int __init uid_cache_init(void) + int n; + + uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), +- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + + for(n = 0; n < UIDHASH_SZ; ++n) + INIT_LIST_HEAD(uidhash_table + n); + + /* Insert the root user immediately (init already runs as root) */ + spin_lock(&uidhash_lock); +- uid_hash_insert(&root_user, uidhashentry(0)); ++ uid_hash_insert(&root_user, __uidhashentry(0)); + spin_unlock(&uidhash_lock); + + return 0; +diff -uprN linux-2.6.15.orig/kernel/ve.c linux-2.6.15-ve025stab014/kernel/ve.c +--- linux-2.6.15.orig/kernel/ve.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/ve.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,177 @@ ++/* ++ * linux/kernel/ve.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++/* ++ * 've.c' helper file performing VE sub-system initialization ++ */ ++ ++#include <linux/sched.h> ++#include <linux/delay.h> ++#include <linux/capability.h> ++#include <linux/ve.h> ++#include <linux/smp_lock.h> ++#include <linux/init.h> ++ ++#include <linux/errno.h> ++#include <linux/unistd.h> ++#include <linux/slab.h> ++#include <linux/sys.h> ++#include <linux/kdev_t.h> ++#include <linux/termios.h> ++#include <linux/tty_driver.h> ++#include <linux/netdevice.h> ++#include <linux/utsname.h> ++#include <linux/proc_fs.h> ++#include <linux/kernel_stat.h> ++#include <linux/module.h> ++#include <linux/rcupdate.h> ++#include <linux/ve_proto.h> ++#include <linux/ve_owner.h> ++#include <linux/devpts_fs.h> ++ ++#include <linux/nfcalls.h> ++ ++unsigned long vz_rstamp = 0x37e0f59d; ++ ++struct module no_module = { .state = MODULE_STATE_GOING }; ++EXPORT_SYMBOL(no_module); ++ ++#ifdef CONFIG_VE ++ ++DCL_VE_OWNER(SKB, struct sk_buff, owner_env) ++DCL_VE_OWNER(SK, struct sock, sk_owner_env) ++DCL_VE_OWNER(TW, struct tcp_tw_bucket, tw_owner_env) ++DCL_VE_OWNER(FILP, struct file, owner_env) ++DCL_VE_OWNER(FSTYPE, struct file_system_type, owner_env) ++ ++#if defined(CONFIG_VE_IPTABLES) ++INIT_KSYM_MODULE(ip_tables); ++INIT_KSYM_MODULE(iptable_filter); ++INIT_KSYM_MODULE(iptable_mangle); ++INIT_KSYM_MODULE(ipt_limit); ++INIT_KSYM_MODULE(ipt_multiport); ++INIT_KSYM_MODULE(ipt_tos); ++INIT_KSYM_MODULE(ipt_TOS); ++INIT_KSYM_MODULE(ipt_REJECT); ++INIT_KSYM_MODULE(ipt_TCPMSS); ++INIT_KSYM_MODULE(ipt_tcpmss); ++INIT_KSYM_MODULE(ipt_ttl); ++INIT_KSYM_MODULE(ipt_LOG); ++INIT_KSYM_MODULE(ipt_length); ++INIT_KSYM_MODULE(ip_conntrack); ++INIT_KSYM_MODULE(ip_conntrack_ftp); ++INIT_KSYM_MODULE(ip_conntrack_irc); ++INIT_KSYM_MODULE(ipt_conntrack); ++INIT_KSYM_MODULE(ipt_state); ++INIT_KSYM_MODULE(ipt_helper); ++INIT_KSYM_MODULE(ip_nat); ++INIT_KSYM_MODULE(iptable_nat); ++INIT_KSYM_MODULE(ip_nat_ftp); ++INIT_KSYM_MODULE(ip_nat_irc); ++ ++INIT_KSYM_CALL(int, init_netfilter, (void)); ++INIT_KSYM_CALL(int, init_iptables, (void)); ++INIT_KSYM_CALL(int, init_iptable_filter, (void)); ++INIT_KSYM_CALL(int, init_iptable_mangle, (void)); ++INIT_KSYM_CALL(int, init_iptable_limit, (void)); ++INIT_KSYM_CALL(int, init_iptable_multiport, (void)); ++INIT_KSYM_CALL(int, init_iptable_tos, (void)); ++INIT_KSYM_CALL(int, init_iptable_TOS, (void)); ++INIT_KSYM_CALL(int, init_iptable_REJECT, (void)); ++INIT_KSYM_CALL(int, init_iptable_TCPMSS, (void)); ++INIT_KSYM_CALL(int, init_iptable_tcpmss, (void)); ++INIT_KSYM_CALL(int, init_iptable_ttl, (void)); ++INIT_KSYM_CALL(int, init_iptable_LOG, (void)); ++INIT_KSYM_CALL(int, init_iptable_length, (void)); ++INIT_KSYM_CALL(int, init_iptable_conntrack, (void)); ++INIT_KSYM_CALL(int, init_iptable_ftp, (void)); ++INIT_KSYM_CALL(int, init_iptable_irc, (void)); ++INIT_KSYM_CALL(int, init_iptable_conntrack_match, (void)); ++INIT_KSYM_CALL(int, init_iptable_state, (void)); ++INIT_KSYM_CALL(int, init_iptable_helper, (void)); ++INIT_KSYM_CALL(int, ip_nat_init, (void)); ++INIT_KSYM_CALL(int, init_iptable_nat, (void)); ++INIT_KSYM_CALL(int, init_iptable_nat_ftp, (void)); ++INIT_KSYM_CALL(int, init_iptable_nat_irc, (void)); ++INIT_KSYM_CALL(void, fini_iptable_nat_irc, (void)); ++INIT_KSYM_CALL(void, fini_iptable_nat_ftp, (void)); ++INIT_KSYM_CALL(void, fini_iptable_nat, (void)); ++INIT_KSYM_CALL(void, ip_nat_cleanup, (void)); ++INIT_KSYM_CALL(void, fini_iptable_helper, (void)); ++INIT_KSYM_CALL(void, fini_iptable_state, (void)); ++INIT_KSYM_CALL(void, fini_iptable_conntrack_match, (void)); 
++INIT_KSYM_CALL(void, fini_iptable_irc, (void)); ++INIT_KSYM_CALL(void, fini_iptable_ftp, (void)); ++INIT_KSYM_CALL(void, fini_iptable_conntrack, (void)); ++INIT_KSYM_CALL(void, fini_iptable_length, (void)); ++INIT_KSYM_CALL(void, fini_iptable_LOG, (void)); ++INIT_KSYM_CALL(void, fini_iptable_ttl, (void)); ++INIT_KSYM_CALL(void, fini_iptable_tcpmss, (void)); ++INIT_KSYM_CALL(void, fini_iptable_TCPMSS, (void)); ++INIT_KSYM_CALL(void, fini_iptable_REJECT, (void)); ++INIT_KSYM_CALL(void, fini_iptable_TOS, (void)); ++INIT_KSYM_CALL(void, fini_iptable_tos, (void)); ++INIT_KSYM_CALL(void, fini_iptable_multiport, (void)); ++INIT_KSYM_CALL(void, fini_iptable_limit, (void)); ++INIT_KSYM_CALL(void, fini_iptable_filter, (void)); ++INIT_KSYM_CALL(void, fini_iptable_mangle, (void)); ++INIT_KSYM_CALL(void, fini_iptables, (void)); ++INIT_KSYM_CALL(void, fini_netfilter, (void)); ++ ++INIT_KSYM_CALL(void, ipt_flush_table, (struct ipt_table *table)); ++#endif ++ ++#ifdef CONFIG_VE_CALLS_MODULE ++INIT_KSYM_MODULE(vzmon); ++INIT_KSYM_CALL(int, real_get_device_perms_ve, ++ (int dev_type, dev_t dev, int access_mode)); ++INIT_KSYM_CALL(void, real_do_env_cleanup, (struct ve_struct *env)); ++INIT_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); ++INIT_KSYM_CALL(void, real_update_load_avg_ve, (void)); ++ ++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode) ++{ ++ return KSYMSAFECALL(int, vzmon, real_get_device_perms_ve, ++ (dev_type, dev, access_mode)); ++} ++EXPORT_SYMBOL(get_device_perms_ve); ++ ++void do_env_cleanup(struct ve_struct *env) ++{ ++ KSYMSAFECALL_VOID(vzmon, real_do_env_cleanup, (env)); ++} ++ ++void do_env_free(struct ve_struct *env) ++{ ++ KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env)); ++} ++EXPORT_SYMBOL(do_env_free); ++ ++void do_update_load_avg_ve(void) ++{ ++ KSYMSAFECALL_VOID(vzmon, real_update_load_avg_ve, ()); ++} ++#endif ++ ++struct ve_struct ve0 = { ++ .utsname = &system_utsname, ++ .vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh), ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ ._net_dev_tail = &ve0._net_dev_base, ++ .ifindex = -1, ++#endif ++#ifdef CONFIG_UNIX98_PTYS ++ .devpts_config = &devpts_config, ++#endif ++}; ++ ++EXPORT_SYMBOL(ve0); ++ ++#endif /* CONFIG_VE */ +diff -uprN linux-2.6.15.orig/kernel/vecalls.c linux-2.6.15-ve025stab014/kernel/vecalls.c +--- linux-2.6.15.orig/kernel/vecalls.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/vecalls.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,3206 @@ ++/* ++ * linux/kernel/vecalls.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ */ ++ ++/* ++ * 'vecalls.c' is file with basic VE support. 
It provides basic primities ++ * along with initialization script ++ */ ++ ++#include <linux/sched.h> ++#include <linux/delay.h> ++#include <linux/capability.h> ++#include <linux/ve.h> ++#include <linux/smp_lock.h> ++#include <linux/init.h> ++#include <linux/list.h> ++#include <linux/ve_owner.h> ++#include <linux/errno.h> ++#include <linux/unistd.h> ++#include <linux/slab.h> ++#include <linux/vmalloc.h> ++#include <linux/sys.h> ++#include <linux/fs.h> ++#include <linux/namespace.h> ++#include <linux/termios.h> ++#include <linux/tty_driver.h> ++#include <linux/netdevice.h> ++#include <linux/wait.h> ++#include <linux/inetdevice.h> ++#include <linux/utsname.h> ++#include <linux/sysctl.h> ++#include <linux/proc_fs.h> ++#include <linux/seq_file.h> ++#include <linux/kernel_stat.h> ++#include <linux/module.h> ++#include <linux/suspend.h> ++#include <linux/rcupdate.h> ++#include <linux/in.h> ++#include <linux/major.h> ++#include <linux/kdev_t.h> ++#include <linux/idr.h> ++#include <linux/inetdevice.h> ++#include <net/pkt_sched.h> ++#include <linux/divert.h> ++#include <ub/beancounter.h> ++ ++#include <net/route.h> ++#include <net/ip_fib.h> ++ ++#include <linux/ve_proto.h> ++#include <linux/venet.h> ++#include <linux/vzctl.h> ++#include <linux/vzcalluser.h> ++#ifdef CONFIG_FAIRSCHED ++#include <linux/fairsched.h> ++#endif ++ ++#include <linux/nfcalls.h> ++ ++struct ve_struct *ve_list_head = NULL; ++int nr_ve = 1; /* One VE always exists. Compatibility with vestat */ ++rwlock_t ve_list_guard = RW_LOCK_UNLOCKED; ++static rwlock_t devperms_hash_guard = RW_LOCK_UNLOCKED; ++ ++extern int glob_virt_pids; ++ ++static int do_env_enter(struct ve_struct *ve); ++int real_env_create(envid_t veid, unsigned flags, u32 class_id, ++ struct env_create_param *data, int datalen); ++static void do_clean_devperms(envid_t veid); ++static int alloc_ve_tty_drivers(struct ve_struct* ve); ++static void free_ve_tty_drivers(struct ve_struct* ve); ++static int register_ve_tty_drivers(struct ve_struct* ve); ++static void unregister_ve_tty_drivers(struct ve_struct* ve); ++static int init_ve_tty_drivers(struct ve_struct *); ++static void fini_ve_tty_drivers(struct ve_struct *); ++static void clear_termios(struct tty_driver* driver ); ++static void ve_mapped_devs_cleanup(struct ve_struct *ve); ++ ++static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat *buf); ++ ++static void vecalls_exit(void); ++ ++struct ve_struct *__find_ve_by_id(envid_t veid) ++{ ++ struct ve_struct *ve; ++ for (ve = ve_list_head; ++ ve != NULL && ve->veid != veid; ++ ve = ve->next); ++ return ve; ++} ++ ++struct ve_struct *get_ve_by_id(envid_t veid) ++{ ++ struct ve_struct *ve; ++ read_lock(&ve_list_guard); ++ ve = __find_ve_by_id(veid); ++ get_ve(ve); ++ read_unlock(&ve_list_guard); ++ return ve; ++} ++ ++/* ++ * real_put_ve() MUST be used instead of put_ve() inside vecalls. 
++ */ ++void real_do_env_free(struct ve_struct *ve); ++static inline void real_put_ve(struct ve_struct *ve) ++{ ++ if (ve && atomic_dec_and_test(&ve->counter)) { ++ if (atomic_read(&ve->pcounter) > 0) ++ BUG(); ++ if (ve->is_running) ++ BUG(); ++ real_do_env_free(ve); ++ } ++} ++ ++extern struct file_system_type devpts_fs_type; ++extern struct file_system_type sysfs_fs_type; ++extern struct file_system_type tmpfs_fs_type; ++extern struct file_system_type proc_fs_type; ++ ++extern spinlock_t task_capability_lock; ++extern void ve_ipc_free(struct ve_struct * ve); ++extern void ip_fragment_cleanup(struct ve_struct *ve); ++ ++static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat *buf) ++{ ++ struct ve_struct *ve; ++ struct vz_cpu_stat *vstat; ++ int retval; ++ int i, cpu; ++ unsigned long tmp; ++ ++ if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid)) ++ return -EPERM; ++ if (veid == 0) ++ return -ESRCH; ++ ++ vstat = kmalloc(sizeof(*vstat), GFP_KERNEL); ++ if (!vstat) ++ return -ENOMEM; ++ memset(vstat, 0, sizeof(*vstat)); ++ ++ retval = -ESRCH; ++ read_lock(&ve_list_guard); ++ ve = __find_ve_by_id(veid); ++ if (ve == NULL) ++ goto out_unlock; ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ struct ve_cpu_stats *st; ++ ++ st = VE_CPU_STATS(ve, cpu); ++ vstat->user_jif += st->user; ++ vstat->nice_jif += st->nice; ++ vstat->system_jif += st->system; ++ vstat->idle_clk += ve_sched_get_idle_time(ve, cpu); ++ } ++ vstat->uptime_clk = get_cycles() - ve->start_cycles; ++ vstat->uptime_jif = jiffies - ve->start_jiffies; ++ for (i = 0; i < 3; i++) { ++ tmp = ve->avenrun[i] + (FIXED_1/200); ++ vstat->avenrun[i].val_int = LOAD_INT(tmp); ++ vstat->avenrun[i].val_frac = LOAD_FRAC(tmp); ++ } ++ read_unlock(&ve_list_guard); ++ ++ retval = 0; ++ if (copy_to_user(buf, vstat, sizeof(*vstat))) ++ retval = -EFAULT; ++out_free: ++ kfree(vstat); ++ return retval; ++ ++out_unlock: ++ read_unlock(&ve_list_guard); ++ goto out_free; ++} ++ ++/********************************************************************** ++ * Devices permissions routines, ++ * character and block devices separately ++ **********************************************************************/ ++ ++/* Rules applied in the following order: ++ MAJOR!=0, MINOR!=0 ++ MAJOR!=0, MINOR==0 ++ MAJOR==0, MINOR==0 ++*/ ++struct devperms_struct ++{ ++ dev_t dev; /* device id */ ++ unsigned char mask; ++ unsigned type; ++ envid_t veid; ++ ++ struct devperms_struct *devhash_next; ++ struct devperms_struct **devhash_pprev; ++}; ++ ++static struct devperms_struct original_perms[] = ++{{ ++ MKDEV(0,0), /*device*/ ++ S_IROTH | S_IWOTH, ++ S_IFCHR, /*type*/ ++ 0, /*veid*/ ++ NULL, NULL ++}, ++{ ++ MKDEV(0,0), /*device*/ ++ S_IXGRP | S_IROTH | S_IWOTH, ++ S_IFBLK, /*type*/ ++ 0, /*veid*/ ++ NULL, NULL ++}}; ++ ++static struct devperms_struct default_major_perms[] = { ++ {MKDEV(UNIX98_PTY_MASTER_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, ++ {MKDEV(UNIX98_PTY_SLAVE_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, ++ {MKDEV(PTY_MASTER_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, ++ {MKDEV(PTY_SLAVE_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, ++}; ++static struct devperms_struct default_minor_perms[] = { ++ {MKDEV(MEM_MAJOR, 3), S_IROTH | S_IWOTH, S_IFCHR}, /* null */ ++ {MKDEV(MEM_MAJOR, 5), S_IROTH | S_IWOTH, S_IFCHR}, /* zero */ ++ {MKDEV(MEM_MAJOR, 7), S_IROTH | S_IWOTH, S_IFCHR}, /* full */ ++ {MKDEV(TTYAUX_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR},/* tty */ ++ {MKDEV(TTYAUX_MAJOR, 2), S_IROTH | S_IWOTH, S_IFCHR},/* ptmx */ ++ {MKDEV(MEM_MAJOR, 8), S_IROTH, S_IFCHR}, /* 
random */ ++ {MKDEV(MEM_MAJOR, 9), S_IROTH, S_IFCHR}, /* urandom */ ++}; ++ ++static struct devperms_struct default_deny_perms = { ++ MKDEV(0, 0), 0, S_IFCHR ++}; ++ ++static inline struct devperms_struct *find_default_devperms(int type, ++ dev_t dev) ++{ ++ int i; ++ ++ /* XXX all defaults perms are S_IFCHR */ ++ if (type != S_IFCHR) ++ return &default_deny_perms; ++ ++ for (i = 0; ++ i < sizeof(default_minor_perms)/sizeof(struct devperms_struct); ++ i++) ++ if (MAJOR(dev) == MAJOR(default_minor_perms[i].dev) && ++ MINOR(dev) == MINOR(default_minor_perms[i].dev)) ++ return &default_minor_perms[i]; ++ for (i = 0; ++ i < sizeof(default_major_perms)/sizeof(struct devperms_struct); ++ i++) ++ if (MAJOR(dev) == MAJOR(default_major_perms[i].dev)) ++ return &default_major_perms[i]; ++ ++ return &default_deny_perms; ++} ++ ++#define DEVPERMS_HASH_SZ 512 ++struct devperms_struct *devperms_hash[DEVPERMS_HASH_SZ]; ++ ++#define devperms_hashfn(id,dev) \ ++ ( (id << 5) ^ (id >> 5) ^ (MAJOR(dev)) ^ MINOR(dev) ) & \ ++ (DEVPERMS_HASH_SZ - 1) ++ ++static inline void hash_devperms(struct devperms_struct *p) ++{ ++ struct devperms_struct **htable = ++ &devperms_hash[devperms_hashfn(p->veid,p->dev)]; ++ ++ if ((p->devhash_next = *htable) != NULL) ++ (*htable)->devhash_pprev = &p->devhash_next; ++ *htable = p; ++ p->devhash_pprev = htable; ++} ++ ++static inline void unhash_devperms(struct devperms_struct *p) ++{ ++ if (p->devhash_next) ++ p->devhash_next->devhash_pprev = p->devhash_pprev; ++ *p->devhash_pprev = p->devhash_next; ++} ++ ++static int __init init_devperms_hash(void) ++{ ++ write_lock_irq(&devperms_hash_guard); ++ memset(devperms_hash, 0, sizeof(devperms_hash)); ++ hash_devperms(original_perms); ++ hash_devperms(original_perms+1); ++ write_unlock_irq(&devperms_hash_guard); ++ return 0; ++} ++ ++static inline void fini_devperms_hash(void) ++{ ++} ++ ++static inline struct devperms_struct *find_devperms(envid_t veid, ++ int type, ++ dev_t dev) ++{ ++ struct devperms_struct *p, **htable = ++ &devperms_hash[devperms_hashfn(veid,dev)]; ++ ++ for (p = *htable; p && !(p->type==type && ++ MAJOR(dev)==MAJOR(p->dev) && ++ MINOR(dev)==MINOR(p->dev) && ++ p->veid==veid); ++ p = p->devhash_next) ++ ; ++ return p; ++} ++ ++ ++static void do_clean_devperms(envid_t veid) ++{ ++ int i; ++ struct devperms_struct* ve; ++ ++ write_lock_irq(&devperms_hash_guard); ++ for (i = 0; i < DEVPERMS_HASH_SZ; i++) ++ for (ve = devperms_hash[i]; ve;) { ++ struct devperms_struct *next = ve->devhash_next; ++ if (ve->veid == veid) { ++ unhash_devperms(ve); ++ kfree(ve); ++ } ++ ++ ve = next; ++ } ++ write_unlock_irq(&devperms_hash_guard); ++} ++ ++/* ++ * Mode is a mask of ++ * FMODE_READ for read access (configurable by S_IROTH) ++ * FMODE_WRITE for write access (configurable by S_IWOTH) ++ * FMODE_QUOTACTL for quotactl access (configurable by S_IXGRP) ++ */ ++int real_get_device_perms_ve(int dev_type, dev_t dev, int access_mode) ++{ ++ struct devperms_struct *perms; ++ struct ve_struct *ve; ++ envid_t veid; ++ ++ perms = NULL; ++ ve = get_exec_env(); ++ veid = ve->veid; ++ ++ read_lock(&devperms_hash_guard); ++ ++ perms = find_devperms(veid, dev_type|VE_USE_MINOR, dev); ++ if (perms) ++ goto end; ++ ++ perms = find_devperms(veid, dev_type|VE_USE_MAJOR, MKDEV(MAJOR(dev),0)); ++ if (perms) ++ goto end; ++ ++ perms = find_devperms(veid, dev_type, MKDEV(0,0)); ++ if (perms) ++ goto end; ++ ++ perms = find_default_devperms(dev_type, dev); ++ ++end: ++ read_unlock(&devperms_hash_guard); ++ ++ access_mode = 
"\000\004\002\006\010\014\012\016"[access_mode]; ++ return perms ? ++ (((perms->mask & access_mode) == access_mode) ? 0 : -EACCES) : ++ -ENODEV; ++} ++ ++int do_setdevperms(envid_t veid, unsigned type, dev_t dev, unsigned mask) ++{ ++ struct devperms_struct *perms; ++ ++ write_lock_irq(&devperms_hash_guard); ++ perms = find_devperms(veid, type, dev); ++ if (!perms) { ++ struct devperms_struct *perms_new; ++ write_unlock_irq(&devperms_hash_guard); ++ ++ perms_new = kmalloc(sizeof(struct devperms_struct), GFP_KERNEL); ++ if (!perms_new) ++ return -ENOMEM; ++ ++ write_lock_irq(&devperms_hash_guard); ++ perms = find_devperms(veid, type, dev); ++ if (perms) { ++ kfree(perms_new); ++ perms_new = perms; ++ } ++ ++ switch (type & VE_USE_MASK) { ++ case 0: ++ dev = 0; ++ break; ++ case VE_USE_MAJOR: ++ dev = MKDEV(MAJOR(dev),0); ++ break; ++ } ++ ++ perms_new->veid = veid; ++ perms_new->dev = dev; ++ perms_new->type = type; ++ perms_new->mask = mask & S_IALLUGO; ++ hash_devperms(perms_new); ++ } else ++ perms->mask = mask & S_IALLUGO; ++ write_unlock_irq(&devperms_hash_guard); ++ return 0; ++} ++EXPORT_SYMBOL(do_setdevperms); ++ ++int real_setdevperms(envid_t veid, unsigned type, dev_t dev, unsigned mask) ++{ ++ struct ve_struct *ve; ++ int err; ++ ++ if (!capable(CAP_SETVEID) || veid == 0) ++ return -EPERM; ++ ++ if ((ve = get_ve_by_id(veid)) == NULL) ++ return -ESRCH; ++ ++ down_read(&ve->op_sem); ++ err = -ESRCH; ++ if (ve->is_running) ++ err = do_setdevperms(veid, type, dev, mask); ++ up_read(&ve->op_sem); ++ real_put_ve(ve); ++ return err; ++} ++ ++void real_update_load_avg_ve(void) ++{ ++ struct ve_struct *ve; ++ unsigned long nr_active; ++ ++ read_lock(&ve_list_guard); ++ for (ve = ve_list_head; ve != NULL; ve = ve->next) { ++ nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve); ++ nr_active *= FIXED_1; ++ CALC_LOAD(ve->avenrun[0], EXP_1, nr_active); ++ CALC_LOAD(ve->avenrun[1], EXP_5, nr_active); ++ CALC_LOAD(ve->avenrun[2], EXP_15, nr_active); ++ } ++ read_unlock(&ve_list_guard); ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * FS-related helpers to VE start/stop ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++/* ++ * DEVPTS needs a virtualization: each environment should see each own list of ++ * pseudo-terminals. ++ * To implement it we need to have separate devpts superblocks for each ++ * VE, and each VE should mount its own one. ++ * Thus, separate vfsmount structures are required. ++ * To minimize intrusion into vfsmount lookup code, separate file_system_type ++ * structures are created. ++ * ++ * In addition to this, patch fo character device itself is required, as file ++ * system itself is used only for MINOR/MAJOR lookup. 
++ */ ++static int register_ve_fs_type(struct ve_struct *ve, ++ struct file_system_type *template, ++ struct file_system_type **p_fs_type, struct vfsmount **p_mnt) ++{ ++ struct vfsmount *mnt; ++ struct file_system_type *local_fs_type; ++ int ret; ++ ++ VZTRACE("register_ve_fs_type(\"%s\")\n", template->name); ++ ++ local_fs_type = kmalloc(sizeof(*local_fs_type) + sizeof(void *), ++ GFP_KERNEL); ++ if (local_fs_type == NULL) ++ return -ENOMEM; ++ ++ memset(local_fs_type, 0, sizeof(*local_fs_type)); ++ local_fs_type->name = template->name; ++ local_fs_type->fs_flags = template->fs_flags; ++ local_fs_type->get_sb = template->get_sb; ++ local_fs_type->kill_sb = template->kill_sb; ++ local_fs_type->owner = template->owner; ++ /* ++ * 1. we do not have refcounter on fstype ++ * 2. fstype holds reference to ve using get_ve()/put_ve(). ++ * so we free fstype when freeing ve and we are sure it's ok to free it ++ */ ++ SET_VE_OWNER_FSTYPE(local_fs_type, ve); ++ get_filesystem(local_fs_type); /* get_ve() inside */ ++ ++ ret = register_filesystem(local_fs_type); /* does not get */ ++ if (ret) ++ goto reg_err; ++ ++ mnt = kern_mount(local_fs_type); ++ if (IS_ERR(mnt)) ++ goto mnt_err; ++ ++ /* Usage counters after succesful execution kern_mount: ++ * local_fs_type - +1 (get_fs_type,get_sb_single,put_filesystem) ++ * mnt - +1 == 1 (alloc_vfsmnt) ++ */ ++ ++ *p_fs_type = local_fs_type; ++ *p_mnt = mnt; ++ return 0; ++ ++mnt_err: ++ ret = PTR_ERR(mnt); ++ unregister_filesystem(local_fs_type); /* does not put */ ++ ++reg_err: ++ put_filesystem(local_fs_type); ++ kfree(local_fs_type); ++ printk(KERN_DEBUG ++ "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret); ++ return ret; ++} ++ ++static void umount_ve_fs_type(struct file_system_type *local_fs_type) ++{ ++ struct vfsmount *mnt; ++ struct list_head *p, *q; ++ LIST_HEAD(kill); ++ LIST_HEAD(umount_list); ++ ++ down_write(&namespace_sem); ++ spin_lock(&vfsmount_lock); ++ list_for_each_safe(p, q, ¤t->namespace->list) { ++ mnt = list_entry(p, struct vfsmount, mnt_list); ++ if (mnt->mnt_sb->s_type != local_fs_type) ++ continue; ++ list_del(p); ++ list_add(p, &kill); ++ } ++ ++ while (!list_empty(&kill)) { ++ mnt = list_entry(kill.next, struct vfsmount, mnt_list); ++ umount_tree(mnt, 1, &umount_list); ++ } ++ spin_unlock(&vfsmount_lock); ++ up_write(&namespace_sem); ++ release_mounts(&umount_list); ++} ++ ++static void unregister_ve_fs_type(struct file_system_type *local_fs_type, ++ struct vfsmount *local_fs_mount) ++{ ++ if (local_fs_mount == NULL || ++ local_fs_type == NULL) { ++ if (local_fs_mount != NULL || ++ local_fs_type != NULL) ++ BUG(); ++ return; ++ } ++ ++ VZTRACE("unregister_ve_fs_type(\"%s\")\n", local_fs_type->name); ++ ++ unregister_filesystem(local_fs_type); ++ umount_ve_fs_type(local_fs_type); ++ kern_umount(local_fs_mount); /* alias to mntput, drop our ref */ ++ put_filesystem(local_fs_type); ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * FS-related helpers to VE start/stop ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++#ifdef CONFIG_SYSCTL ++static ctl_table ve_sysctl_tables[] = { ++ /* kernel */ ++ { ++ .ctl_name = CTL_KERN, ++ .procname = "kernel", ++ .mode = 0555, ++ .child = &ve_sysctl_tables[2], ++ }, ++ { .ctl_name = 0 }, ++ /* kernel/[vars] */ ++ { ++ .ctl_name = KERN_NODENAME, ++ .procname = 
"hostname", ++ .maxlen = 64, ++ .mode = 0644, ++ .proc_handler = &proc_doutsstring, ++ .strategy = &sysctl_string, ++ }, ++ { ++ .ctl_name = KERN_DOMAINNAME, ++ .procname = "domainname", ++ .maxlen = 64, ++ .mode = 0644, ++ .proc_handler = &proc_doutsstring, ++ .strategy = &sysctl_string, ++ }, ++ { ++ .ctl_name = KERN_SHMMAX, ++ .procname = "shmmax", ++ .maxlen = sizeof(size_t), ++ .mode = 0644, ++ .proc_handler = &proc_doulongvec_minmax, ++ }, ++ { ++ .ctl_name = KERN_SHMALL, ++ .procname = "shmall", ++ .maxlen = sizeof(size_t), ++ .mode = 0644, ++ .proc_handler = &proc_doulongvec_minmax, ++ }, ++ { ++ .ctl_name = KERN_SHMMNI, ++ .procname = "shmmni", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_MSGMAX, ++ .procname = "msgmax", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_MSGMNI, ++ .procname = "msgmni", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_MSGMNB, ++ .procname = "msgmnb", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_SEM, ++ .procname = "sem", ++ .maxlen = 4 * sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { .ctl_name = 0, } ++}; ++ ++static int register_ve_sysctltables(struct ve_struct *ve) ++{ ++ struct ctl_table_header *header; ++ ctl_table *root, *table; ++ ++ VZTRACE("register_ve_sysctltables\n"); ++ ++ root = clone_sysctl_template(ve_sysctl_tables, ++ sizeof(ve_sysctl_tables) / sizeof(ctl_table)); ++ if (root == NULL) ++ goto out; ++ ++ table = root->child; ++ table[0].data = &ve->utsname->nodename; ++ table[1].data = &ve->utsname->domainname; ++ table[2].data = &ve->_shm_ctlmax; ++ table[3].data = &ve->_shm_ctlall; ++ table[4].data = &ve->_shm_ctlmni; ++ table[5].data = &ve->_msg_ctlmax; ++ table[6].data = &ve->_msg_ctlmni; ++ table[7].data = &ve->_msg_ctlmnb; ++ table[8].data = &ve->_sem_ctls[0]; ++ ++ /* insert at head to override kern entries */ ++ header = register_sysctl_table(root, 1); ++ if (header == NULL) ++ goto out_free; ++ ++ ve->kern_header = header; ++ ve->kern_table = root; ++ return 0; ++ ++out_free: ++ free_sysctl_clone(root); ++out: ++ return -ENOMEM; ++} ++ ++static inline void unregister_ve_sysctltables(struct ve_struct *ve) ++{ ++ unregister_sysctl_table(ve->kern_header); ++} ++ ++static inline void free_ve_sysctltables(struct ve_struct *ve) ++{ ++ free_sysctl_clone(ve->kern_table); ++} ++#endif ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE start: subsystems ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#include <net/ip.h> ++#include <net/tcp.h> ++#include <net/udp.h> ++#include <net/icmp.h> ++ ++static int init_ve_utsname(struct ve_struct *ve) ++{ ++ ve->utsname = kmalloc(sizeof(*ve->utsname), GFP_KERNEL); ++ if (ve->utsname == NULL) ++ return -ENOMEM; ++ ++ down_read(&uts_sem); /* protect the source */ ++ memcpy(ve->utsname, &system_utsname, sizeof(*ve->utsname)); ++ up_read(&uts_sem); ++ ++ return 0; ++} ++ ++static void free_ve_utsname(struct ve_struct *ve) ++{ ++ kfree(ve->utsname); ++ ve->utsname = NULL; ++} ++ ++static int init_fini_ve_mibs(struct ve_struct *ve, int fini) 
++{ ++ if (fini) ++ goto fini; ++ if (!(ve->_net_statistics[0] = alloc_percpu(struct linux_mib))) ++ goto out1; ++ if (!(ve->_net_statistics[1] = alloc_percpu(struct linux_mib))) ++ goto out2; ++ if (!(ve->_ip_statistics[0] = alloc_percpu(struct ipstats_mib))) ++ goto out3; ++ if (!(ve->_ip_statistics[1] = alloc_percpu(struct ipstats_mib))) ++ goto out4; ++ if (!(ve->_icmp_statistics[0] = alloc_percpu(struct icmp_mib))) ++ goto out5; ++ if (!(ve->_icmp_statistics[1] = alloc_percpu(struct icmp_mib))) ++ goto out6; ++ if (!(ve->_tcp_statistics[0] = alloc_percpu(struct tcp_mib))) ++ goto out7; ++ if (!(ve->_tcp_statistics[1] = alloc_percpu(struct tcp_mib))) ++ goto out8; ++ if (!(ve->_udp_statistics[0] = alloc_percpu(struct udp_mib))) ++ goto out9; ++ if (!(ve->_udp_statistics[1] = alloc_percpu(struct udp_mib))) ++ goto out10; ++ return 0; ++fini: ++ free_percpu(ve->_udp_statistics[1]); ++out10: ++ free_percpu(ve->_udp_statistics[0]); ++out9: ++ free_percpu(ve->_tcp_statistics[1]); ++out8: ++ free_percpu(ve->_tcp_statistics[0]); ++out7: ++ free_percpu(ve->_icmp_statistics[1]); ++out6: ++ free_percpu(ve->_icmp_statistics[0]); ++out5: ++ free_percpu(ve->_ip_statistics[1]); ++out4: ++ free_percpu(ve->_ip_statistics[0]); ++out3: ++ free_percpu(ve->_net_statistics[1]); ++out2: ++ free_percpu(ve->_net_statistics[0]); ++out1: ++ return -ENOMEM; ++} ++ ++static inline int init_ve_mibs(struct ve_struct *ve) ++{ ++ return init_fini_ve_mibs(ve, 0); ++} ++ ++static inline void fini_ve_mibs(struct ve_struct *ve) ++{ ++ (void)init_fini_ve_mibs(ve, 1); ++} ++ ++extern struct net_device templ_loopback_dev; ++static void veloop_setup(struct net_device *dev) ++{ ++ int padded; ++ padded = dev->padded; ++ memcpy(dev, &templ_loopback_dev, sizeof(struct net_device)); ++ dev->padded = padded; ++} ++ ++static int init_ve_netdev(void) ++{ ++ struct ve_struct *ve; ++ struct net_device_stats *stats; ++ int err; ++ ++ ve = get_exec_env(); ++ INIT_HLIST_HEAD(&ve->_net_dev_head); ++ ve->_net_dev_base = NULL; ++ ve->_net_dev_tail = &ve->_net_dev_base; ++ ++ ve->_loopback_dev = alloc_netdev(0, templ_loopback_dev.name, ++ veloop_setup); ++ if (ve->_loopback_dev == NULL) ++ return -ENOMEM; ++ if (loopback_dev.get_stats != NULL) { ++ stats = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); ++ if (stats != NULL) { ++ memset(stats, 0, sizeof(struct net_device_stats)); ++ ve->_loopback_dev->priv = stats; ++ ve->_loopback_dev->get_stats = loopback_dev.get_stats; ++ ve->_loopback_dev->destructor = loopback_dev.destructor; ++ } ++ } ++ err = register_netdev(ve->_loopback_dev); ++ if (err) { ++ if (ve->_loopback_dev->priv != NULL) ++ kfree(ve->_loopback_dev->priv); ++ free_netdev(ve->_loopback_dev); ++ } ++ return err; ++} ++ ++static void fini_ve_netdev(void) ++{ ++ struct ve_struct *ve; ++ struct net_device *dev; ++ ++ ve = get_exec_env(); ++ while (1) { ++ rtnl_lock(); ++ /* ++ * loopback is special, it can be referenced in fib's, ++ * so it must be freed the last. Doing so is ++ * sufficient to guarantee absence of such references. 
++ */ ++ if (dev_base == ve->_loopback_dev) ++ dev = dev_base->next; ++ else ++ dev = dev_base; ++ if (dev == NULL) ++ break; ++ unregister_netdevice(dev); ++ rtnl_unlock(); ++ free_netdev(dev); ++ } ++ unregister_netdevice(ve->_loopback_dev); ++ rtnl_unlock(); ++ free_netdev(ve->_loopback_dev); ++ ve->_loopback_dev = NULL; ++} ++#else ++#define init_ve_mibs(ve) (0) ++#define fini_ve_mibs(ve) do { } while (0) ++#define init_ve_netdev() (0) ++#define fini_ve_netdev() do { } while (0) ++#endif ++ ++static int prepare_proc_root(struct ve_struct *ve) ++{ ++ struct proc_dir_entry *de; ++ ++ de = kmalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL); ++ if (de == NULL) ++ return -ENOMEM; ++ memset(de, 0, sizeof(struct proc_dir_entry)); ++ memcpy(de + 1, "/proc", 6); ++ de->name = (char *)(de + 1); ++ de->namelen = 5; ++ de->mode = S_IFDIR | S_IRUGO | S_IXUGO; ++ de->nlink = 2; ++ atomic_set(&de->count, 1); ++ ++ ve->proc_root = de; ++ return 0; ++} ++ ++#ifdef CONFIG_PROC_FS ++static int init_ve_proc(struct ve_struct *ve) ++{ ++ int err; ++ struct proc_dir_entry *de; ++ ++ err = prepare_proc_root(ve); ++ if (err) ++ goto out_root; ++ ++ err = register_ve_fs_type(ve, &proc_fs_type, ++ &ve->proc_fstype, &ve->proc_mnt); ++ if (err) ++ goto out_reg; ++ ++ /* create necessary /proc subdirs in VE local proc tree */ ++ err = -ENOMEM; ++ de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); ++ if (!de) ++ goto out_vz; ++ ++#ifdef CONFIG_VE_IPTABLES ++ proc_net = proc_mkdir("net", NULL); ++ if (!proc_net) ++ goto out_net; ++#endif ++ ++ return 0; ++ ++#ifdef CONFIG_VE_IPTABLES ++out_net: ++ remove_proc_entry("vz", NULL); ++#endif ++out_vz: ++ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); ++ ve->proc_mnt = NULL; ++out_reg: ++ /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */ ++ ; ++out_root: ++ return err; ++} ++ ++static void fini_ve_proc(struct ve_struct *ve) ++{ ++#ifdef CONFIG_VE_IPTABLES ++ remove_proc_entry("net", NULL); ++ proc_net = NULL; ++#endif ++ remove_proc_entry("vz", NULL); ++ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); ++ ve->proc_mnt = NULL; ++} ++ ++static void free_ve_proc(struct ve_struct *ve) ++{ ++ /* proc filesystem frees proc_dir_entries on remove_proc_entry() only, ++ so we check that everything was removed and not lost */ ++ if (ve->proc_root && ve->proc_root->subdir) { ++ struct proc_dir_entry *p = ve->proc_root; ++ printk(KERN_WARNING "VPS: %d: proc entry /proc", ve->veid); ++ while ((p = p->subdir) != NULL) ++ printk("/%s", p->name); ++ printk(" is not removed!\n"); ++ } ++ ++ kfree(ve->proc_root); ++ kfree(ve->proc_fstype); ++ ++ ve->proc_fstype = NULL; ++ ve->proc_root = NULL; ++} ++#else ++#define init_ve_proc(ve) (0) ++#define fini_ve_proc(ve) do { } while (0) ++#define free_ve_proc(ve) do { } while (0) ++#endif ++ ++#ifdef CONFIG_SYSCTL ++static int init_ve_sysctl(struct ve_struct *ve) ++{ ++ int err; ++ ++#ifdef CONFIG_PROC_FS ++ err = -ENOMEM; ++ ve->proc_sys_root = proc_mkdir("sys", 0); ++ if (ve->proc_sys_root == NULL) ++ goto out_proc; ++#endif ++ INIT_LIST_HEAD(&ve->sysctl_lh); ++ err = register_ve_sysctltables(ve); ++ if (err) ++ goto out_reg; ++ ++ err = devinet_sysctl_init(ve); ++ if (err) ++ goto out_dev; ++ ++ return 0; ++ ++out_dev: ++ unregister_ve_sysctltables(ve); ++ free_ve_sysctltables(ve); ++out_reg: ++#ifdef CONFIG_PROC_FS ++ remove_proc_entry("sys", NULL); ++out_proc: ++#endif ++ return err; ++} ++ ++static void fini_ve_sysctl(struct ve_struct *ve) ++{ ++ devinet_sysctl_fini(ve); ++ 
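/*
 * Illustrative note, not part of the patch: fini_ve_sysctl() undoes
 * init_ve_sysctl() in reverse order -- the devinet sysctls above, then
 * the cloned "kernel" ctl_table, then the VE-local /proc/sys directory
 * below; the cloned table memory itself is released separately by
 * free_ve_sysctl()/free_ve_sysctltables().  While the VE runs, the
 * clone set up in register_ve_sysctltables() is what backs e.g.
 * /proc/sys/kernel/hostname inside the VE with ve->utsname->nodename
 * rather than the global system_utsname.
 */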
unregister_ve_sysctltables(ve); ++ remove_proc_entry("sys", NULL); ++} ++ ++static void free_ve_sysctl(struct ve_struct *ve) ++{ ++ devinet_sysctl_free(ve); ++ free_ve_sysctltables(ve); ++} ++#else ++#define init_ve_sysctl(ve) (0) ++#define fini_ve_sysctl(ve) do { } while (0) ++#define free_ve_sysctl(ve) do { } while (0) ++#endif ++ ++#ifdef CONFIG_UNIX98_PTYS ++#include <linux/devpts_fs.h> ++ ++static int init_ve_devpts(struct ve_struct *ve) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ ve->devpts_config = kmalloc(sizeof(struct devpts_config), GFP_KERNEL); ++ if (ve->devpts_config == NULL) ++ goto out; ++ memset(ve->devpts_config, 0, sizeof(struct devpts_config)); ++ ve->devpts_config->mode = 0600; ++ err = register_ve_fs_type(ve, &devpts_fs_type, ++ &ve->devpts_fstype, &ve->devpts_mnt); ++ if (err) { ++ kfree(ve->devpts_config); ++ ve->devpts_config = NULL; ++ } ++out: ++ return err; ++} ++ ++static void fini_ve_devpts(struct ve_struct *ve) ++{ ++ unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt); ++ /* devpts_fstype is freed in real_put_ve -> free_ve_filesystems */ ++ ve->devpts_mnt = NULL; ++ kfree(ve->devpts_config); ++ ve->devpts_config = NULL; ++} ++#else ++#define init_ve_devpts(ve) (0) ++#define fini_ve_devpts(ve) do { } while (0) ++#endif ++ ++static int init_ve_shmem(struct ve_struct *ve) ++{ ++ return register_ve_fs_type(ve, ++ &tmpfs_fs_type, ++ &ve->shmem_fstype, ++ &ve->shmem_mnt); ++} ++ ++static void fini_ve_shmem(struct ve_struct *ve) ++{ ++ unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt); ++ /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */ ++ ve->shmem_mnt = NULL; ++} ++ ++static int init_ve_sysfs(struct ve_struct *ve) ++{ ++ struct subsystem *subsys; ++ struct class *nc; ++ int err; ++ extern struct subsystem class_obj_subsys; ++ extern struct subsystem class_subsys; ++ extern struct class net_class; ++ ++#ifdef CONFIG_VE_SYSFS ++ err = register_ve_fs_type(ve, ++ &sysfs_fs_type, ++ &ve->sysfs_fstype, ++ &ve->sysfs_mnt); ++ if (err != 0) ++ goto out_fs_type; ++#endif ++ err = -ENOMEM; ++ subsys = kmalloc(sizeof(*subsys), GFP_KERNEL); ++ if (subsys == NULL) ++ goto out_class_obj; ++ /* ick, this is ugly, the things we go through to keep from showing up ++ * in sysfs... */ ++ memset(subsys, 0, sizeof(*subsys)); ++ memcpy(&subsys->kset.kobj.name, &class_obj_subsys.kset.kobj.name, ++ sizeof(subsys->kset.kobj.name)); ++ subsys->kset.ktype = class_obj_subsys.kset.ktype; ++ subsys->kset.hotplug_ops = class_obj_subsys.kset.hotplug_ops; ++ subsystem_init(subsys); ++ if (!subsys->kset.subsys) ++ subsys->kset.subsys = subsys; ++ ve->class_obj_subsys = subsys; ++ ++ err = -ENOMEM; ++ subsys = kmalloc(sizeof(*subsys), GFP_KERNEL); ++ if (subsys == NULL) ++ goto out_class_subsys; ++ /* ick, this is ugly, the things we go through to keep from showing up ++ * in sysfs... 
*/ ++ memset(subsys, 0, sizeof(*subsys)); ++ memcpy(&subsys->kset.kobj.name, &class_subsys.kset.kobj.name, ++ sizeof(subsys->kset.kobj.name)); ++ subsys->kset.ktype = class_subsys.kset.ktype; ++ subsys->kset.hotplug_ops = class_subsys.kset.hotplug_ops; ++ ve->class_subsys = subsys; ++ err = subsystem_register(subsys); ++ if (err != 0) ++ goto out_register; ++ ++ err = -ENOMEM; ++ nc = kmalloc(sizeof(*nc), GFP_KERNEL); ++ if (nc == NULL) ++ goto out_nc; ++ memset(nc, 0, sizeof(*nc)); ++ nc->name = net_class.name; ++ nc->release = net_class.release; ++ nc->hotplug = net_class.hotplug; ++ err = class_register(nc); ++ if (err != 0) ++ goto out_class_register; ++ ve->net_class = nc; ++ ++ return err; ++ ++out_class_register: ++ kfree(nc); ++out_nc: ++ subsystem_unregister(subsys); ++out_register: ++ kfree(ve->class_subsys); ++out_class_subsys: ++ kfree(ve->class_obj_subsys); ++out_class_obj: ++#ifdef CONFIG_VE_SYSFS ++ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); ++ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ ++out_fs_type: ++#endif ++ ve->class_subsys = NULL; ++ ve->class_obj_subsys = NULL; ++ return err; ++} ++ ++static void fini_ve_sysfs(struct ve_struct *ve) ++{ ++ class_unregister(ve->net_class); ++ subsystem_unregister(ve->class_subsys); ++ ++ kfree(ve->net_class); ++ kfree(ve->class_subsys); ++ kfree(ve->class_obj_subsys); ++ ++ ve->net_class = NULL; ++ ve->class_subsys = NULL; ++ ve->class_obj_subsys = NULL; ++#ifdef CONFIG_VE_SYSFS ++ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); ++ ve->sysfs_mnt = NULL; ++ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ ++#endif ++} ++ ++static void free_ve_filesystems(struct ve_struct *ve) ++{ ++#ifdef CONFIG_VE_SYSFS ++ kfree(ve->sysfs_fstype); ++ ve->sysfs_fstype = NULL; ++#endif ++ kfree(ve->shmem_fstype); ++ ve->shmem_fstype = NULL; ++ ++ kfree(ve->devpts_fstype); ++ ve->devpts_fstype = NULL; ++ ++ free_ve_proc(ve); ++} ++ ++static int init_printk(struct ve_struct *ve) ++{ ++ struct ve_prep_printk { ++ wait_queue_head_t log_wait; ++ unsigned long log_start; ++ unsigned long log_end; ++ unsigned long logged_chars; ++ } *tmp; ++ ++ tmp = kmalloc(sizeof(struct ve_prep_printk), GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ memset(tmp, 0, sizeof(struct ve_prep_printk)); ++ init_waitqueue_head(&tmp->log_wait); ++ ve->_log_wait = &tmp->log_wait; ++ ve->_log_start = &tmp->log_start; ++ ve->_log_end = &tmp->log_end; ++ ve->_logged_chars = &tmp->logged_chars; ++ /* ve->log_buf will be initialized later by ve_log_init() */ ++ return 0; ++} ++ ++static void fini_printk(struct ve_struct *ve) ++{ ++ /* ++ * there is no spinlock protection here because nobody can use ++ * log_buf at the moments when this code is called. ++ */ ++ kfree(ve->log_buf); ++ kfree(ve->_log_wait); ++} ++ ++static void fini_venet(struct ve_struct *ve) ++{ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ tcp_v4_kill_ve_sockets(ve); ++#endif ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ ve_mapped_devs_cleanup(ve); ++#endif ++} ++ ++static int init_ve_sched(struct ve_struct *ve) ++{ ++#ifdef CONFIG_FAIRSCHED ++ int err; ++ ++ /* ++ * We refuse to switch to an already existing node since nodes ++ * keep a pointer to their ve_struct... 
++ */ ++ err = sys_fairsched_mknod(0, 1, ve->veid); ++ if (err < 0) { ++ printk(KERN_WARNING "Can't create fairsched node %d\n", ++ ve->veid); ++ return err; ++ } ++ err = sys_fairsched_mvpr(current->pid, ve->veid); ++ if (err) { ++ printk(KERN_WARNING "Can't switch to fairsched node %d\n", ++ ve->veid); ++ if (sys_fairsched_rmnod(ve->veid)) ++ printk(KERN_ERR "Can't clean fairsched node %d\n", ++ ve->veid); ++ return err; ++ } ++#endif ++ ve_sched_attach(ve); ++ return 0; ++} ++ ++static void fini_ve_sched(struct ve_struct *ve) ++{ ++#ifdef CONFIG_FAIRSCHED ++ if (task_vsched_id(current) == ve->veid) ++ if (sys_fairsched_mvpr(current->pid, fairsched_init_node.id)) ++ printk(KERN_WARNING "Can't leave fairsched node %d\n", ++ ve->veid); ++ if (sys_fairsched_rmnod(ve->veid)) ++ printk(KERN_ERR "Can't remove fairsched node %d\n", ++ ve->veid); ++#endif ++} ++ ++static int init_ve_struct(struct ve_struct *ve, envid_t veid, ++ u32 class_id, struct task_struct *init_tsk) ++{ ++ int n; ++ ++ memset(ve, 0, sizeof(struct ve_struct)); ++ (void)get_ve(ve); ++ ve->veid = veid; ++ ve->class_id = class_id; ++ ve->init_entry = init_tsk; ++ INIT_LIST_HEAD(&ve->vetask_lh); ++ init_rwsem(&ve->op_sem); ++ ve->ifindex = -1; ++ ++ for(n = 0; n < UIDHASH_SZ_VE; ++n) ++ INIT_LIST_HEAD(&ve->uidhash_table[n]); ++ ++ do_posix_clock_monotonic_gettime(&ve->start_timespec); ++ ve->start_jiffies = jiffies; ++ ve->start_cycles = get_cycles(); ++ ve->virt_pids = glob_virt_pids; ++ ++ return 0; ++} ++ ++static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk) ++{ ++ read_lock(&tsk->fs->lock); ++ ve->fs_rootmnt = tsk->fs->rootmnt; ++ ve->fs_root = tsk->fs->root; ++ read_unlock(&tsk->fs->lock); ++ mark_tree_virtual(ve->fs_rootmnt, ve->fs_root); ++} ++ ++static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk) ++{ ++ /* required for real_setdevperms from register_ve_<fs> above */ ++ memcpy(&ve->cap_default, &tsk->cap_effective, sizeof(kernel_cap_t)); ++ cap_lower(ve->cap_default, CAP_SETVEID); ++} ++ ++static int ve_list_add(struct ve_struct *ve) ++{ ++ write_lock_irq(&ve_list_guard); ++ if (__find_ve_by_id(ve->veid) != NULL) ++ goto err_exists; ++ ++ ve->prev = NULL; ++ ve->next = ve_list_head; ++ if (ve_list_head) ++ ve_list_head->prev = ve; ++ ve_list_head = ve; ++ nr_ve++; ++ write_unlock_irq(&ve_list_guard); ++ return 0; ++ ++err_exists: ++ write_unlock_irq(&ve_list_guard); ++ return -EEXIST; ++} ++ ++static void ve_list_del(struct ve_struct *ve) ++{ ++ write_lock_irq(&ve_list_guard); ++ if (ve->prev) ++ ve->prev->next = ve->next; ++ else ++ ve_list_head = ve->next; ++ if (ve->next) ++ ve->next->prev = ve->prev; ++ nr_ve--; ++ write_unlock_irq(&ve_list_guard); ++} ++ ++static void set_task_ve_caps(struct task_struct *tsk, struct ve_struct *ve) ++{ ++ spin_lock(&task_capability_lock); ++ cap_mask(tsk->cap_effective, ve->cap_default); ++ cap_mask(tsk->cap_inheritable, ve->cap_default); ++ cap_mask(tsk->cap_permitted, ve->cap_default); ++ spin_unlock(&task_capability_lock); ++} ++ ++static void move_task(struct task_struct *tsk, struct ve_struct *new, ++ struct ve_struct *old) ++{ ++ /* this probihibts ptracing of task entered to VPS from host system */ ++ tsk->mm->vps_dumpable = 0; ++ /* setup capabilities before enter */ ++ set_task_ve_caps(tsk, new); ++ ++ write_lock_irq(&tasklist_lock); ++ VE_TASK_INFO(tsk)->owner_env = new; ++ VE_TASK_INFO(tsk)->exec_env = new; ++ REMOVE_VE_LINKS(tsk); ++ SET_VE_LINKS(tsk); ++ ++ atomic_dec(&old->pcounter); ++ atomic_inc(&new->pcounter); ++ 
real_put_ve(old); ++ get_ve(new); ++ write_unlock_irq(&tasklist_lock); ++} ++ ++#if (defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)) && \ ++ defined(CONFIG_NETFILTER) && defined(CONFIG_VE_IPTABLES) ++#define init_ve_netfilter() init_netfilter() ++#define fini_ve_netfilter() fini_netfilter() ++#else ++#define init_ve_netfilter() (0) ++#define fini_ve_netfilter() do { } while (0) ++#endif ++ ++#define KSYMIPTINIT(mask, ve, full_mask, mod, name, args) \ ++({ \ ++ int ret = 0; \ ++ if (VE_IPT_CMP(mask, full_mask) && \ ++ VE_IPT_CMP((ve)->_iptables_modules, \ ++ full_mask & ~(full_mask##_MOD))) { \ ++ ret = KSYMERRCALL(1, mod, name, args); \ ++ if (ret == 0) \ ++ (ve)->_iptables_modules |= \ ++ full_mask##_MOD; \ ++ if (ret == 1) \ ++ ret = 0; \ ++ } \ ++ ret; \ ++}) ++ ++#define KSYMIPTFINI(mask, full_mask, mod, name, args) \ ++({ \ ++ if (VE_IPT_CMP(mask, full_mask##_MOD)) \ ++ KSYMSAFECALL_VOID(mod, name, args); \ ++}) ++ ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++static int do_ve_iptables(struct ve_struct *ve, __u64 init_mask, ++ int init_or_cleanup) ++{ ++ int err; ++ ++ err = 0; ++ if (!init_or_cleanup) ++ goto cleanup; ++ ++ /* init part */ ++#if defined(CONFIG_IP_NF_IPTABLES) || \ ++ defined(CONFIG_IP_NF_IPTABLES_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, ++ ip_tables, init_iptables, ()); ++ if (err < 0) ++ goto err_iptables; ++#endif ++#if defined(CONFIG_IP_NF_CONNTRACK) || \ ++ defined(CONFIG_IP_NF_CONNTRACK_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK, ++ ip_conntrack, init_iptable_conntrack, ()); ++ if (err < 0) ++ goto err_iptable_conntrack; ++#endif ++#if defined(CONFIG_IP_NF_FTP) || \ ++ defined(CONFIG_IP_NF_FTP_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_FTP, ++ ip_conntrack_ftp, init_iptable_ftp, ()); ++ if (err < 0) ++ goto err_iptable_ftp; ++#endif ++#if defined(CONFIG_IP_NF_IRC) || \ ++ defined(CONFIG_IP_NF_IRC_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_IRC, ++ ip_conntrack_irc, init_iptable_irc, ()); ++ if (err < 0) ++ goto err_iptable_irc; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_CONNTRACK) || \ ++ defined(CONFIG_IP_NF_MATCH_CONNTRACK_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_CONNTRACK, ++ ipt_conntrack, init_iptable_conntrack_match, ()); ++ if (err < 0) ++ goto err_iptable_conntrack_match; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_STATE) || \ ++ defined(CONFIG_IP_NF_MATCH_STATE_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_STATE, ++ ipt_state, init_iptable_state, ()); ++ if (err < 0) ++ goto err_iptable_state; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_HELPER) || \ ++ defined(CONFIG_IP_NF_MATCH_HELPER_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_HELPER, ++ ipt_helper, init_iptable_helper, ()); ++ if (err < 0) ++ goto err_iptable_helper; ++#endif ++#if defined(CONFIG_IP_NF_NAT) || \ ++ defined(CONFIG_IP_NF_NAT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, ++ ip_nat, ip_nat_init, ()); ++ if (err < 0) ++ goto err_iptable_nat; ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, ++ iptable_nat, init_iptable_nat, ()); ++ if (err < 0) ++ goto err_iptable_nat2; ++#endif ++#if defined(CONFIG_IP_NF_NAT_FTP) || \ ++ defined(CONFIG_IP_NF_NAT_FTP_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_FTP, ++ ip_nat_ftp, init_iptable_nat_ftp, ()); ++ if (err < 0) ++ goto err_iptable_nat_ftp; ++#endif ++#if defined(CONFIG_IP_NF_NAT_IRC) || \ ++ defined(CONFIG_IP_NF_NAT_IRC_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_IRC, ++ 
ip_nat_irc, init_iptable_nat_irc, ()); ++ if (err < 0) ++ goto err_iptable_nat_irc; ++#endif ++#if defined(CONFIG_IP_NF_FILTER) || \ ++ defined(CONFIG_IP_NF_FILTER_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER, ++ iptable_filter, init_iptable_filter, ()); ++ if (err < 0) ++ goto err_iptable_filter; ++#endif ++#if defined(CONFIG_IP_NF_MANGLE) || \ ++ defined(CONFIG_IP_NF_MANGLE_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE, ++ iptable_mangle, init_iptable_mangle, ()); ++ if (err < 0) ++ goto err_iptable_mangle; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_LIMIT) || \ ++ defined(CONFIG_IP_NF_MATCH_LIMIT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_LIMIT, ++ ipt_limit, init_iptable_limit, ()); ++ if (err < 0) ++ goto err_iptable_limit; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_MULTIPORT) || \ ++ defined(CONFIG_IP_NF_MATCH_MULTIPORT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_MULTIPORT, ++ ipt_multiport, init_iptable_multiport, ()); ++ if (err < 0) ++ goto err_iptable_multiport; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TOS) || \ ++ defined(CONFIG_IP_NF_MATCH_TOS_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TOS, ++ ipt_tos, init_iptable_tos, ()); ++ if (err < 0) ++ goto err_iptable_tos; ++#endif ++#if defined(CONFIG_IP_NF_TARGET_TOS) || \ ++ defined(CONFIG_IP_NF_TARGET_TOS_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_TOS, ++ ipt_TOS, init_iptable_TOS, ()); ++ if (err < 0) ++ goto err_iptable_TOS; ++#endif ++#if defined(CONFIG_IP_NF_TARGET_REJECT) || \ ++ defined(CONFIG_IP_NF_TARGET_REJECT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_REJECT, ++ ipt_REJECT, init_iptable_REJECT, ()); ++ if (err < 0) ++ goto err_iptable_REJECT; ++#endif ++#if defined(CONFIG_IP_NF_TARGET_TCPMSS) || \ ++ defined(CONFIG_IP_NF_TARGET_TCPMSS_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_TCPMSS, ++ ipt_TCPMSS, init_iptable_TCPMSS, ()); ++ if (err < 0) ++ goto err_iptable_TCPMSS; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TCPMSS) || \ ++ defined(CONFIG_IP_NF_MATCH_TCPMSS_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TCPMSS, ++ ipt_tcpmss, init_iptable_tcpmss, ()); ++ if (err < 0) ++ goto err_iptable_tcpmss; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TTL) || \ ++ defined(CONFIG_IP_NF_MATCH_TTL_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TTL, ++ ipt_ttl, init_iptable_ttl, ()); ++ if (err < 0) ++ goto err_iptable_ttl; ++#endif ++#if defined(CONFIG_IP_NF_TARGET_LOG) || \ ++ defined(CONFIG_IP_NF_TARGET_LOG_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_LOG, ++ ipt_LOG, init_iptable_LOG, ()); ++ if (err < 0) ++ goto err_iptable_LOG; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_LENGTH) || \ ++ defined(CONFIG_IP_NF_MATCH_LENGTH_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_LENGTH, ++ ipt_length, init_iptable_length, ()); ++ if (err < 0) ++ goto err_iptable_length; ++#endif ++ return 0; ++ ++/* ------------------------------------------------------------------------- */ ++ ++cleanup: ++#if defined(CONFIG_IP_NF_MATCH_LENGTH) || \ ++ defined(CONFIG_IP_NF_MATCH_LENGTH_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_LENGTH, ++ ipt_length, fini_iptable_length, ()); ++err_iptable_length: ++#endif ++#if defined(CONFIG_IP_NF_TARGET_LOG) || \ ++ defined(CONFIG_IP_NF_TARGET_LOG_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_LOG, ++ ipt_LOG, fini_iptable_LOG, ()); ++err_iptable_LOG: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TTL) || \ ++ defined(CONFIG_IP_NF_MATCH_TTL_MODULE) ++ 
KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TTL, ++ ipt_ttl, fini_iptable_ttl, ()); ++err_iptable_ttl: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TCPMSS) || \ ++ defined(CONFIG_IP_NF_MATCH_TCPMSS_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TCPMSS, ++ ipt_tcpmss, fini_iptable_tcpmss, ()); ++err_iptable_tcpmss: ++#endif ++#if defined(CONFIG_IP_NF_TARGET_TCPMSS) || \ ++ defined(CONFIG_IP_NF_TARGET_TCPMSS_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_TCPMSS, ++ ipt_TCPMSS, fini_iptable_TCPMSS, ()); ++err_iptable_TCPMSS: ++#endif ++#if defined(CONFIG_IP_NF_TARGET_REJECT) || \ ++ defined(CONFIG_IP_NF_TARGET_REJECT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_REJECT, ++ ipt_REJECT, fini_iptable_REJECT, ()); ++err_iptable_REJECT: ++#endif ++#if defined(CONFIG_IP_NF_TARGET_TOS) || \ ++ defined(CONFIG_IP_NF_TARGET_TOS_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_TOS, ++ ipt_TOS, fini_iptable_TOS, ()); ++err_iptable_TOS: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TOS) || \ ++ defined(CONFIG_IP_NF_MATCH_TOS_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TOS, ++ ipt_tos, fini_iptable_tos, ()); ++err_iptable_tos: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_MULTIPORT) || \ ++ defined(CONFIG_IP_NF_MATCH_MULTIPORT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_MULTIPORT, ++ ipt_multiport, fini_iptable_multiport, ()); ++err_iptable_multiport: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_LIMIT) || \ ++ defined(CONFIG_IP_NF_MATCH_LIMIT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_LIMIT, ++ ipt_limit, fini_iptable_limit, ()); ++err_iptable_limit: ++#endif ++#if defined(CONFIG_IP_NF_MANGLE) || \ ++ defined(CONFIG_IP_NF_MANGLE_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, ++ iptable_mangle, fini_iptable_mangle, ()); ++err_iptable_mangle: ++#endif ++#if defined(CONFIG_IP_NF_FILTER) || \ ++ defined(CONFIG_IP_NF_FILTER_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, ++ iptable_filter, fini_iptable_filter, ()); ++err_iptable_filter: ++#endif ++#if defined(CONFIG_IP_NF_NAT_IRC) || \ ++ defined(CONFIG_IP_NF_NAT_IRC_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_IRC, ++ ip_nat_irc, fini_iptable_nat_irc, ()); ++err_iptable_nat_irc: ++#endif ++#if defined(CONFIG_IP_NF_NAT_FTP) || \ ++ defined(CONFIG_IP_NF_NAT_FTP_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_FTP, ++ ip_nat_ftp, fini_iptable_nat_ftp, ()); ++err_iptable_nat_ftp: ++#endif ++#if defined(CONFIG_IP_NF_NAT) || \ ++ defined(CONFIG_IP_NF_NAT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, ++ iptable_nat, fini_iptable_nat, ()); ++err_iptable_nat2: ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, ++ ip_nat, ip_nat_cleanup, ()); ++err_iptable_nat: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_HELPER) || \ ++ defined(CONFIG_IP_NF_MATCH_HELPER_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_HELPER, ++ ipt_helper, fini_iptable_helper, ()); ++err_iptable_helper: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_STATE) || \ ++ defined(CONFIG_IP_NF_MATCH_STATE_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_STATE, ++ ipt_state, fini_iptable_state, ()); ++err_iptable_state: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_CONNTRACK) || \ ++ defined(CONFIG_IP_NF_MATCH_CONNTRACK_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_CONNTRACK, ++ ipt_conntrack, fini_iptable_conntrack_match, ()); ++err_iptable_conntrack_match: ++#endif ++#if defined(CONFIG_IP_NF_IRC) || \ ++ defined(CONFIG_IP_NF_IRC_MODULE) ++ 
KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK_IRC, ++ ip_conntrack_irc, fini_iptable_irc, ()); ++err_iptable_irc: ++#endif ++#if defined(CONFIG_IP_NF_FTP) || \ ++ defined(CONFIG_IP_NF_FTP_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK_FTP, ++ ip_conntrack_ftp, fini_iptable_ftp, ()); ++err_iptable_ftp: ++#endif ++#if defined(CONFIG_IP_NF_CONNTRACK) || \ ++ defined(CONFIG_IP_NF_CONNTRACK_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK, ++ ip_conntrack, fini_iptable_conntrack, ()); ++err_iptable_conntrack: ++#endif ++#if defined(CONFIG_IP_NF_IPTABLES) || \ ++ defined(CONFIG_IP_NF_IPTABLES_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, ++ ip_tables, fini_iptables, ()); ++err_iptables: ++#endif ++ ve->_iptables_modules = 0; ++ ++ return err; ++} ++#else ++#define do_ve_iptables(ve, initmask, init) (0) ++#endif ++ ++static inline int init_ve_iptables(struct ve_struct *ve, __u64 init_mask) ++{ ++ return do_ve_iptables(ve, init_mask, 1); ++} ++ ++static inline void fini_ve_iptables(struct ve_struct *ve, __u64 init_mask) ++{ ++ (void)do_ve_iptables(ve, init_mask, 0); ++} ++ ++static void flush_ve_iptables(struct ve_struct *ve) ++{ ++ /* ++ * flush all rule tables first, ++ * this helps us to avoid refs to freed objs ++ */ ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, ip_tables, ++ ipt_flush_table, (ve->_ipt_mangle_table)); ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, ip_tables, ++ ipt_flush_table, (ve->_ve_ipt_filter_pf)); ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, ip_tables, ++ ipt_flush_table, (ve->_ip_conntrack->_ip_nat_table)); ++} ++ ++static struct list_head ve_hooks[VE_MAX_HOOKS]; ++static DECLARE_RWSEM(ve_hook_sem); ++ ++int ve_hook_register(struct ve_hook *vh) ++{ ++ struct list_head *lh; ++ struct ve_hook *tmp; ++ ++ down_write(&ve_hook_sem); ++ list_for_each(lh, &ve_hooks[vh->hooknum]) { ++ tmp = list_entry(lh, struct ve_hook, list); ++ if (vh->priority < tmp->priority) ++ break; ++ } ++ list_add_tail(&vh->list, lh); ++ up_write(&ve_hook_sem); ++ return 0; ++} ++EXPORT_SYMBOL(ve_hook_register); ++ ++void ve_hook_unregister(struct ve_hook *vh) ++{ ++ down_write(&ve_hook_sem); ++ list_del(&vh->list); ++ up_write(&ve_hook_sem); ++} ++EXPORT_SYMBOL(ve_hook_unregister); ++ ++static int ve_hook_iterate(unsigned int hooknum, void *data) ++{ ++ struct ve_hook *vh; ++ int err; ++ ++ err = 0; ++ down_read(&ve_hook_sem); ++ list_for_each_entry(vh, &ve_hooks[hooknum], list) { ++ if (!try_module_get(vh->owner)) ++ continue; ++ err = vh->hook(hooknum, data); ++ module_put(vh->owner); ++ if (err) ++ break; ++ } ++ ++ if (err) { ++ list_for_each_entry_continue_reverse(vh, ++ &ve_hooks[hooknum], list) { ++ if (!try_module_get(vh->owner)) ++ continue; ++ if (vh->undo) ++ vh->undo(hooknum, data); ++ module_put(vh->owner); ++ } ++ } ++ up_read(&ve_hook_sem); ++ return err; ++} ++ ++static void ve_hook_iterate_cleanup(unsigned int hooknum, void *data) ++{ ++ struct ve_hook *vh; ++ ++ down_read(&ve_hook_sem); ++ list_for_each_entry_reverse(vh, &ve_hooks[hooknum], list) { ++ if (!try_module_get(vh->owner)) ++ continue; ++ (void)vh->hook(hooknum, data); ++ module_put(vh->owner); ++ } ++ up_read(&ve_hook_sem); ++} ++ ++static int do_env_create(envid_t veid, u32 class_id, ++ struct env_create_param *data, int datalen) ++{ ++ struct task_struct *tsk; ++ struct ve_struct *old; ++ struct ve_struct *old_exec; ++ struct ve_struct *ve; ++ struct ve_hook_init_data vhd; ++ __u64 init_mask; ++ int err; ++ ++ tsk = current; ++ old = 
VE_TASK_INFO(tsk)->owner_env; ++ ++ if (!thread_group_leader(tsk)) ++ return -EINVAL; ++ ++ if (tsk->signal->tty) { ++ printk("ERR: VE init has controlling terminal\n"); ++ return -EINVAL; ++ } ++ if (tsk->signal->pgrp != tsk->pid || tsk->signal->session != tsk->pid) { ++ int may_setsid; ++ read_lock(&tasklist_lock); ++ may_setsid = (find_pid(PIDTYPE_PGID, tsk->pid) == NULL); ++ read_unlock(&tasklist_lock); ++ if (!may_setsid) { ++ printk("ERR: VE init is process group leader\n"); ++ return -EINVAL; ++ } ++ } ++ ++ ++ VZTRACE("%s: veid=%d classid=%d pid=%d\n", ++ __FUNCTION__, veid, class_id, current->pid); ++ ++ err = -ENOMEM; ++ ve = kmalloc(sizeof(struct ve_struct), GFP_KERNEL); ++ if (ve == NULL) ++ goto err_struct; ++ ++ init_ve_struct(ve, veid, class_id, tsk); ++ __module_get(THIS_MODULE); ++ down_write(&ve->op_sem); ++ if ((err = ve_list_add(ve)) < 0) ++ goto err_exist; ++ ++ /* this should be done before context switching */ ++ if ((err = init_printk(ve)) < 0) ++ goto err_log_wait; ++ ++ old_exec = set_exec_env(ve); ++ ++ if ((err = init_ve_sched(ve)) < 0) ++ goto err_sched; ++ ++ /* move user to VE */ ++ if ((err = set_user(0, 0)) < 0) ++ goto err_set_user; ++ ++ set_ve_root(ve, tsk); ++ ++ if ((err = init_ve_utsname(ve))) ++ goto err_utsname; ++ ++ if ((err = init_ve_mibs(ve))) ++ goto err_mibs; ++ ++ if ((err = init_ve_proc(ve))) ++ goto err_proc; ++ ++ if ((err = init_ve_sysctl(ve))) ++ goto err_sysctl; ++ ++ if ((err = init_ve_sysfs(ve))) ++ goto err_sysfs; ++ ++ if ((err = init_ve_route(ve)) < 0) ++ goto err_route; ++ ++ if ((err = init_ve_netdev())) ++ goto err_dev; ++ ++ if ((err = init_ve_tty_drivers(ve)) < 0) ++ goto err_tty; ++ ++ if ((err = init_ve_shmem(ve))) ++ goto err_shmem; ++ ++ if ((err = init_ve_devpts(ve))) ++ goto err_devpts; ++ ++ /* init SYSV IPC variables */ ++ if ((err = init_ve_ipc(ve)) < 0) ++ goto err_ipc; ++ ++ set_ve_caps(ve, tsk); ++ ++ if (alloc_vpid(tsk->pid, 1) < 0) { ++ err = -EBUSY; ++ goto err_vpid; ++ } ++ set_virt_pid(tsk, 1); ++ set_virt_tgid(tsk, 1); ++ ++ set_special_pids(tsk->pid, tsk->pid); ++ current->signal->tty_old_pgrp = 0; ++ set_virt_pgid(tsk, 1); ++ set_virt_sid(tsk, 1); ++ ++ /* It is safe to initialize netfilter here as routing initialization and ++ interface setup will be done below. This means that NO skb can be ++ passed inside. Den */ ++ /* iptables ve initialization for non ve0; ++ ve0 init is in module_init */ ++ if ((err = init_ve_netfilter()) < 0) ++ goto err_netfilter; ++ ++ init_mask = (data)? 
*((__u64 *)data): VE_IP_DEFAULT; ++ if ((err = init_ve_iptables(ve, init_mask)) < 0) ++ goto err_iptables; ++ ++ vhd.env = ve; ++ vhd.class_id = class_id; ++ vhd.data = data; ++ vhd.datalen = datalen; ++ if ((err = ve_hook_iterate(VE_HOOK_INIT, (void *)&vhd)) < 0) ++ goto err_ve_hook; ++ ++ move_task(tsk, ve, old); ++ ++ ve->is_running = 1; ++ up_write(&ve->op_sem); ++ ++ printk(KERN_INFO "VPS: %d: started\n", veid); ++ return veid; ++ ++err_ve_hook: ++ fini_venet(ve); ++ fini_ve_iptables(ve, init_mask); ++err_iptables: ++ fini_ve_netfilter(); ++err_netfilter: ++ ; ++err_vpid: ++ fini_ve_ipc(ve); ++err_ipc: ++ fini_ve_devpts(ve); ++err_devpts: ++ fini_ve_shmem(ve); ++err_shmem: ++ fini_ve_tty_drivers(ve); ++err_tty: ++ fini_ve_netdev(); ++err_dev: ++ fini_ve_route(ve); ++err_route: ++ fini_ve_sysfs(ve); ++err_sysfs: ++ fini_ve_sysctl(ve); ++err_sysctl: ++ fini_ve_proc(ve); ++err_proc: ++ do_clean_devperms(ve->veid); /* register procfs adds devperms */ ++ fini_ve_mibs(ve); ++err_mibs: ++ /* free_ve_utsname() is called inside real_put_ve() */ ; ++err_utsname: ++ /* It is safe to restore current->envid here because ++ * ve_fairsched_detach does not use current->envid. */ ++ /* Really fairsched code uses current->envid in sys_fairsched_mknod ++ * only. It is correct if sys_fairsched_mknod is called from ++ * userspace. If sys_fairsched_mknod is called from ++ * ve_fairsched_attach, then node->envid and node->parent_node->envid ++ * are explicitly set to valid value after the call. */ ++ /* FIXME */ ++ VE_TASK_INFO(tsk)->owner_env = old; ++ VE_TASK_INFO(tsk)->exec_env = old_exec; ++ /* move user back */ ++ if (set_user(0, 0) < 0) ++ printk(KERN_WARNING"Can't restore UID\n"); ++ ++err_set_user: ++ fini_ve_sched(ve); ++err_sched: ++ (void)set_exec_env(old_exec); ++ ++ /* we can jump here having incorrect envid */ ++ VE_TASK_INFO(tsk)->owner_env = old; ++ fini_printk(ve); ++err_log_wait: ++ ve_list_del(ve); ++ up_write(&ve->op_sem); ++ ++ real_put_ve(ve); ++err_struct: ++ printk(KERN_INFO "VPS: %d: failed to start with err=%d\n", veid, err); ++ return err; ++ ++err_exist: ++ kfree(ve); ++ goto err_struct; ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE start/stop callbacks ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++int real_env_create(envid_t veid, unsigned flags, u32 class_id, ++ struct env_create_param *data, int datalen) ++{ ++ int status; ++ struct ve_struct *ve; ++ ++ if (!flags) { ++ status = get_exec_env()->veid; ++ goto out; ++ } ++ ++ status = -EPERM; ++ if (!capable(CAP_SETVEID)) ++ goto out; ++ ++ status = -EINVAL; ++ if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE))) ++ goto out; ++ ++ status = -EINVAL; ++ ve = get_ve_by_id(veid); ++ if (ve) { ++ if (flags & VE_TEST) { ++ status = 0; ++ goto out_put; ++ } ++ if (flags & VE_EXCLUSIVE) { ++ status = -EACCES; ++ goto out_put; ++ } ++ if (flags & VE_CREATE) { ++ flags &= ~VE_CREATE; ++ flags |= VE_ENTER; ++ } ++ } else { ++ if (flags & (VE_TEST|VE_ENTER)) { ++ status = -ESRCH; ++ goto out; ++ } ++ } ++ ++ if (flags & VE_CREATE) { ++ status = do_env_create(veid, class_id, data, datalen); ++ goto out; ++ } else if (flags & VE_ENTER) ++ status = do_env_enter(ve); ++ ++ /* else: returning EINVAL */ ++ ++out_put: ++ real_put_ve(ve); ++out: ++ return status; ++} ++ ++static int do_env_enter(struct 
ve_struct *ve) ++{ ++ struct task_struct *tsk = current; ++ int err; ++ ++ VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid); ++ ++ err = -EBUSY; ++ down_read(&ve->op_sem); ++ if (!ve->is_running) ++ goto out_up; ++ ++#ifdef CONFIG_FAIRSCHED ++ err = sys_fairsched_mvpr(current->pid, ve->veid); ++ if (err) ++ goto out_up; ++#endif ++ ++ ve_sched_attach(ve); ++ move_task(current, ve, VE_TASK_INFO(tsk)->owner_env); ++ err = VE_TASK_INFO(tsk)->owner_env->veid; ++ ++out_up: ++ up_read(&ve->op_sem); ++ return err; ++} ++ ++static void env_cleanup(struct ve_struct *ve) ++{ ++ struct ve_struct *old_ve; ++ ++ VZTRACE("real_do_env_cleanup\n"); ++ ++ down_read(&ve->op_sem); ++ old_ve = set_exec_env(ve); ++ ++ ve_hook_iterate_cleanup(VE_HOOK_FINI, (void *)ve); ++ ++ fini_venet(ve); ++ ++ /* no new packets in flight beyond this point */ ++ synchronize_net(); ++ /* skb hold dst_entry, and in turn lies in the ip fragment queue */ ++ ip_fragment_cleanup(ve); ++ ++ fini_ve_netdev(); ++ fini_ve_route(ve); ++ ++ /* kill iptables */ ++ /* No skb belonging to VE can exist at this point as unregister_netdev ++ is an operation awaiting until ALL skb's gone */ ++ flush_ve_iptables(ve); ++ fini_ve_iptables(ve, ve->_iptables_modules); ++ fini_ve_netfilter(); ++ ++ ve_ipc_cleanup(); ++ ++ fini_ve_sched(ve); ++ do_clean_devperms(ve->veid); ++ ++ fini_ve_devpts(ve); ++ fini_ve_shmem(ve); ++ fini_ve_sysfs(ve); ++ unregister_ve_tty_drivers(ve); ++ fini_ve_sysctl(ve); ++ fini_ve_proc(ve); ++ ++ fini_ve_mibs(ve); ++ ++ (void)set_exec_env(old_ve); ++ fini_printk(ve); /* no printk can happen in ve context anymore */ ++ ++ ve_list_del(ve); ++ up_read(&ve->op_sem); ++ ++ real_put_ve(ve); ++} ++ ++static struct list_head ve_cleanup_list; ++static spinlock_t ve_cleanup_lock; ++ ++static DECLARE_COMPLETION(vzmond_complete); ++static struct task_struct *vzmond_thread; ++static volatile int stop_vzmond; ++ ++void real_do_env_cleanup(struct ve_struct *ve) ++{ ++ spin_lock(&ve_cleanup_lock); ++ list_add_tail(&ve->cleanup_list, &ve_cleanup_list); ++ spin_unlock(&ve_cleanup_lock); ++ wake_up_process(vzmond_thread); ++} ++ ++static void do_pending_env_cleanups(void) ++{ ++ struct ve_struct *ve; ++ ++ spin_lock(&ve_cleanup_lock); ++ while (1) { ++ if (list_empty(&ve_cleanup_list) || need_resched()) ++ break; ++ ve = list_entry(ve_cleanup_list.next, struct ve_struct, ++ cleanup_list); ++ list_del(&ve->cleanup_list); ++ spin_unlock(&ve_cleanup_lock); ++ env_cleanup(ve); ++ spin_lock(&ve_cleanup_lock); ++ } ++ spin_unlock(&ve_cleanup_lock); ++} ++ ++static int have_pending_cleanups(void) ++{ ++ return !list_empty(&ve_cleanup_list); ++} ++ ++static int vzmond(void *arg) ++{ ++ daemonize("vzmond"); ++ vzmond_thread = current; ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ while (!stop_vzmond) { ++ schedule(); ++ try_to_freeze(); ++ if (signal_pending(current)) ++ flush_signals(current); ++ ++ do_pending_env_cleanups(); ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (have_pending_cleanups()) ++ __set_current_state(TASK_RUNNING); ++ } ++ ++ __set_task_state(current, TASK_RUNNING); ++ complete_and_exit(&vzmond_complete, 0); ++} ++ ++static int __init init_vzmond(void) ++{ ++ INIT_LIST_HEAD(&ve_cleanup_list); ++ spin_lock_init(&ve_cleanup_lock); ++ stop_vzmond = 0; ++ return kernel_thread(vzmond, NULL, 0); ++} ++ ++static void fini_vzmond(void) ++{ ++ stop_vzmond = 1; ++ wake_up_process(vzmond_thread); ++ wait_for_completion(&vzmond_complete); ++ WARN_ON(!list_empty(&ve_cleanup_list)); ++} ++ ++void real_do_env_free(struct ve_struct *ve) ++{ ++ 
VZTRACE("real_do_env_free\n"); ++ ++ ve_ipc_free(ve); /* free SYSV IPC resources */ ++ free_ve_tty_drivers(ve); ++ free_ve_utsname(ve); ++ free_ve_sysctl(ve); /* free per ve sysctl data */ ++ free_ve_filesystems(ve); ++ printk(KERN_INFO "VPS: %d: stopped\n", VEID(ve)); ++ kfree(ve); ++ ++ module_put(THIS_MODULE); ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE TTY handling ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++DCL_VE_OWNER(TTYDRV, struct tty_driver, owner_env) ++ ++static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base, ++ struct ve_struct *ve) ++{ ++ size_t size; ++ struct tty_driver *driver; ++ ++ driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL); ++ if (!driver) ++ goto out; ++ ++ memcpy(driver, base, sizeof(struct tty_driver)); ++ ++ driver->driver_state = NULL; ++ ++ size = base->num * 3 * sizeof(void *); ++ if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { ++ void **p; ++ p = kmalloc(size, GFP_KERNEL); ++ if (!p) ++ goto out_free; ++ memset(p, 0, size); ++ driver->ttys = (struct tty_struct **)p; ++ driver->termios = (struct termios **)(p + driver->num); ++ driver->termios_locked = (struct termios **)(p + driver->num * 2); ++ } else { ++ driver->ttys = NULL; ++ driver->termios = NULL; ++ driver->termios_locked = NULL; ++ } ++ ++ SET_VE_OWNER_TTYDRV(driver, ve); ++ driver->flags |= TTY_DRIVER_INSTALLED; ++ ++ return driver; ++ ++out_free: ++ kfree(driver); ++out: ++ return NULL; ++} ++ ++static void free_ve_tty_driver(struct tty_driver *driver) ++{ ++ if (!driver) ++ return; ++ ++ clear_termios(driver); ++ kfree(driver->ttys); ++ kfree(driver); ++} ++ ++static int alloc_ve_tty_drivers(struct ve_struct* ve) ++{ ++#ifdef CONFIG_LEGACY_PTYS ++ /* Traditional BSD devices */ ++ ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve); ++ if (!ve->pty_driver) ++ goto out_mem; ++ ++ ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve); ++ if (!ve->pty_slave_driver) ++ goto out_mem; ++ ++ ve->pty_driver->other = ve->pty_slave_driver; ++ ve->pty_slave_driver->other = ve->pty_driver; ++#endif ++ ++#ifdef CONFIG_UNIX98_PTYS ++ ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve); ++ if (!ve->ptm_driver) ++ goto out_mem; ++ ++ ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve); ++ if (!ve->pts_driver) ++ goto out_mem; ++ ++ ve->ptm_driver->other = ve->pts_driver; ++ ve->pts_driver->other = ve->ptm_driver; ++ ++ ve->allocated_ptys = kmalloc(sizeof(*ve->allocated_ptys), GFP_KERNEL); ++ if (!ve->allocated_ptys) ++ goto out_mem; ++ idr_init(ve->allocated_ptys); ++#endif ++ return 0; ++ ++out_mem: ++ free_ve_tty_drivers(ve); ++ return -ENOMEM; ++} ++ ++static void free_ve_tty_drivers(struct ve_struct* ve) ++{ ++#ifdef CONFIG_LEGACY_PTYS ++ free_ve_tty_driver(ve->pty_driver); ++ free_ve_tty_driver(ve->pty_slave_driver); ++ ve->pty_driver = ve->pty_slave_driver = NULL; ++#endif ++#ifdef CONFIG_UNIX98_PTYS ++ free_ve_tty_driver(ve->ptm_driver); ++ free_ve_tty_driver(ve->pts_driver); ++ kfree(ve->allocated_ptys); ++ ve->ptm_driver = ve->pts_driver = NULL; ++ ve->allocated_ptys = NULL; ++#endif ++} ++ ++static inline void __register_tty_driver(struct tty_driver *driver) ++{ ++ list_add(&driver->tty_drivers, &tty_drivers); ++} ++ ++static inline void __unregister_tty_driver(struct tty_driver *driver) ++{ ++ if (!driver) ++ return; ++ 
list_del(&driver->tty_drivers); ++} ++ ++static int register_ve_tty_drivers(struct ve_struct* ve) ++{ ++ write_lock_irq(&tty_driver_guard); ++#ifdef CONFIG_UNIX98_PTYS ++ __register_tty_driver(ve->ptm_driver); ++ __register_tty_driver(ve->pts_driver); ++#endif ++#ifdef CONFIG_LEGACY_PTYS ++ __register_tty_driver(ve->pty_driver); ++ __register_tty_driver(ve->pty_slave_driver); ++#endif ++ write_unlock_irq(&tty_driver_guard); ++ ++ return 0; ++} ++ ++static void unregister_ve_tty_drivers(struct ve_struct* ve) ++{ ++ VZTRACE("unregister_ve_tty_drivers\n"); ++ ++ write_lock_irq(&tty_driver_guard); ++ __unregister_tty_driver(ve->pty_driver); ++ __unregister_tty_driver(ve->pty_slave_driver); ++#ifdef CONFIG_UNIX98_PTYS ++ __unregister_tty_driver(ve->ptm_driver); ++ __unregister_tty_driver(ve->pts_driver); ++#endif ++ write_unlock_irq(&tty_driver_guard); ++} ++ ++static int init_ve_tty_drivers(struct ve_struct *ve) ++{ ++ int err; ++ ++ if ((err = alloc_ve_tty_drivers(ve))) ++ goto err_ttyalloc; ++ if ((err = register_ve_tty_drivers(ve))) ++ goto err_ttyreg; ++ return 0; ++ ++err_ttyreg: ++ free_ve_tty_drivers(ve); ++err_ttyalloc: ++ return err; ++} ++ ++static void fini_ve_tty_drivers(struct ve_struct *ve) ++{ ++ unregister_ve_tty_drivers(ve); ++ free_ve_tty_drivers(ve); ++} ++ ++/* ++ * Free the termios and termios_locked structures because ++ * we don't want to get memory leaks when modular tty ++ * drivers are removed from the kernel. ++ */ ++static void clear_termios(struct tty_driver *driver) ++{ ++ int i; ++ struct termios *tp; ++ ++ if (driver->termios == NULL) ++ return; ++ for (i = 0; i < driver->num; i++) { ++ tp = driver->termios[i]; ++ if (tp) { ++ driver->termios[i] = NULL; ++ kfree(tp); ++ } ++ tp = driver->termios_locked[i]; ++ if (tp) { ++ driver->termios_locked[i] = NULL; ++ kfree(tp); ++ } ++ } ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * Pieces of VE network ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#include <asm/uaccess.h> ++#include <net/sock.h> ++#include <linux/netlink.h> ++#include <linux/rtnetlink.h> ++#include <net/route.h> ++#include <net/ip_fib.h> ++#endif ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++static void ve_del_ip_addrs(struct net_device *dev) ++{ ++ struct in_device *in_dev; ++ ++ in_dev = in_dev_get(dev); ++ if (in_dev == NULL) ++ return; ++ ++ while (in_dev->ifa_list != NULL) { ++ inet_del_ifa(in_dev, &in_dev->ifa_list, 1); ++ } ++ in_dev_put(in_dev); ++} ++ ++static int ve_netdev_cleanup(struct net_device *dev, int to_ve) ++{ ++ int err; ++ ++ err = 0; ++ ve_del_ip_addrs(dev); ++ if ((dev->flags & IFF_UP) != 0) ++ err = dev_close(dev); ++ synchronize_net(); ++ dev_shutdown(dev); ++ dev_mc_discard(dev); ++ free_divert_blk(dev); ++ synchronize_net(); ++ ++ if (to_ve) ++ dev->orig_mtu = dev->mtu; ++ else { ++ int rc = dev_set_mtu(dev, dev->orig_mtu); ++ if (err == 0) ++ err = rc; ++ } ++ ++ return err; ++} ++ ++static void __ve_dev_move(struct net_device *dev, struct ve_struct *ve_src, ++ struct ve_struct *ve_dst, struct user_beancounter *exec_ub) ++{ ++ struct net_device **dp, *d; ++ struct user_beancounter *ub; ++ ++ for (d = ve_src->_net_dev_base, dp = NULL; d != NULL; ++ dp = &d->next, d = d->next) { ++ if (d == dev) { ++ 
hlist_del(&dev->name_hlist); ++ hlist_del(&dev->index_hlist); ++ if (ve_src->_net_dev_tail == &dev->next) ++ ve_src->_net_dev_tail = dp; ++ if (dp) ++ *dp = dev->next; ++ dev->next = NULL; ++ break; ++ } ++ } ++ *ve_dst->_net_dev_tail = dev; ++ ve_dst->_net_dev_tail = &dev->next; ++ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name, ve_dst)); ++ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex, ve_dst)); ++ dev->owner_env = ve_dst; ++ ++ ub = netdev_bc(dev)->exec_ub; ++ netdev_bc(dev)->exec_ub = get_beancounter(exec_ub); ++ put_beancounter(ub); ++} ++ ++static int ve_dev_add(envid_t veid, char *dev_name) ++{ ++ int err; ++ struct net_device *dev; ++ struct ve_struct *ve; ++ struct hlist_node *p; ++ ++ dev = NULL; ++ err = -ESRCH; ++ ++ ve = get_ve_by_id(veid); ++ if (ve == NULL) ++ goto out; ++ ++ rtnl_lock(); ++ ++ read_lock(&dev_base_lock); ++ hlist_for_each(p, dev_name_hash(dev_name, get_ve0())) { ++ struct net_device *d = hlist_entry(p, struct net_device, ++ name_hlist); ++ if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) { ++ dev = d; ++ break; ++ } ++ } ++ read_unlock(&dev_base_lock); ++ if (dev == NULL) ++ goto out_unlock; ++ ++ err = -EPERM; ++ if (!ve_is_dev_movable(dev)) ++ goto out_unlock; ++ ++ err = -EINVAL; ++ if (dev->flags & (IFF_SLAVE|IFF_MASTER)) ++ goto out_unlock; ++ ++ ve_netdev_cleanup(dev, 1); ++ ++ write_lock_bh(&dev_base_lock); ++ __ve_dev_move(dev, get_ve0(), ve, get_exec_ub()); ++ write_unlock_bh(&dev_base_lock); ++ ++ err = 0; ++ ++out_unlock: ++ rtnl_unlock(); ++ real_put_ve(ve); ++ ++ if (dev == NULL) ++ printk(KERN_WARNING "Device %s not found\n", dev_name); ++ ++out: ++ return err; ++} ++ ++static int ve_dev_del(envid_t veid, char *dev_name) ++{ ++ int err; ++ struct net_device *dev; ++ struct ve_struct *ve, *old_exec; ++ struct hlist_node *p; ++ ++ dev = NULL; ++ err = -ESRCH; ++ ++ ve = get_ve_by_id(veid); ++ if (ve == NULL) ++ goto out; ++ ++ rtnl_lock(); ++ ++ read_lock(&dev_base_lock); ++ hlist_for_each(p, dev_name_hash(dev_name, ve)) { ++ struct net_device *d = hlist_entry(p, struct net_device, ++ name_hlist); ++ if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) { ++ dev = d; ++ break; ++ } ++ } ++ read_unlock(&dev_base_lock); ++ if (dev == NULL) ++ goto out_unlock; ++ ++ err = -EPERM; ++ if (!ve_is_dev_movable(dev)) ++ goto out_unlock; ++ ++ old_exec = set_exec_env(ve); ++ ve_netdev_cleanup(dev, 0); ++ (void)set_exec_env(old_exec); ++ ++ write_lock_bh(&dev_base_lock); ++ __ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub); ++ write_unlock_bh(&dev_base_lock); ++ ++ err = 0; ++ ++out_unlock: ++ rtnl_unlock(); ++ real_put_ve(ve); ++ ++ if (dev == NULL) ++ printk(KERN_WARNING "Device %s not found\n", dev_name); ++ ++out: ++ return err; ++} ++ ++int real_ve_dev_map(envid_t veid, int op, char *dev_name) ++{ ++ int err; ++ err = -EPERM; ++ if (!capable(CAP_SETVEID)) ++ goto out; ++ switch (op) ++ { ++ case VE_NETDEV_ADD: ++ err = ve_dev_add(veid, dev_name); ++ break; ++ case VE_NETDEV_DEL: ++ err = ve_dev_del(veid, dev_name); ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++out: ++ return err; ++} ++ ++static void ve_mapped_devs_cleanup(struct ve_struct *ve) ++{ ++ struct net_device *dev; ++ ++ rtnl_lock(); ++ write_lock_bh(&dev_base_lock); ++restart: ++ for (dev = ve->_net_dev_base; dev != NULL; dev = dev->next) ++ { ++ if ((dev->features & NETIF_F_VENET) || ++ (dev == ve->_loopback_dev)) /* Skip loopback dev */ ++ continue; ++ write_unlock_bh(&dev_base_lock); ++ ve_netdev_cleanup(dev, 0); ++ write_lock_bh(&dev_base_lock); ++ 
__ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub); ++ goto restart; ++ } ++ write_unlock_bh(&dev_base_lock); ++ rtnl_unlock(); ++} ++#endif ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE information via /proc ++ * ++ ********************************************************************** ++ **********************************************************************/ ++#ifdef CONFIG_PROC_FS ++static int devperms_seq_show(struct seq_file *m, void *v) ++{ ++ struct devperms_struct *dp; ++ char dev_s[32], type_c; ++ unsigned use, type; ++ dev_t dev; ++ ++ dp = (struct devperms_struct *)v; ++ if (dp == (struct devperms_struct *)1L) { ++ seq_printf(m, "Version: 2.7\n"); ++ return 0; ++ } ++ ++ use = dp->type & VE_USE_MASK; ++ type = dp->type & S_IFMT; ++ dev = dp->dev; ++ ++ if ((use | VE_USE_MINOR) == use) ++ snprintf(dev_s, sizeof(dev_s), "%d:%d", MAJOR(dev), MINOR(dev)); ++ else if ((use | VE_USE_MAJOR) == use) ++ snprintf(dev_s, sizeof(dev_s), "%d:*", MAJOR(dp->dev)); ++ else ++ snprintf(dev_s, sizeof(dev_s), "*:*"); ++ ++ if (type == S_IFCHR) ++ type_c = 'c'; ++ else if (type == S_IFBLK) ++ type_c = 'b'; ++ else ++ type_c = '?'; ++ ++ seq_printf(m, "%10u %c %03o %s\n", dp->veid, type_c, dp->mask, dev_s); ++ return 0; ++} ++ ++static void *devperms_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ loff_t cpos; ++ long slot; ++ struct devperms_struct *dp; ++ ++ cpos = *pos; ++ read_lock(&devperms_hash_guard); ++ if (cpos-- == 0) ++ return (void *)1L; ++ ++ for (slot = 0; slot < DEVPERMS_HASH_SZ; slot++) ++ for (dp = devperms_hash[slot]; dp; dp = dp->devhash_next) ++ if (cpos-- == 0) { ++ m->private = (void *)slot; ++ return dp; ++ } ++ return NULL; ++} ++ ++static void *devperms_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ long slot; ++ struct devperms_struct *dp; ++ ++ dp = (struct devperms_struct *)v; ++ ++ if (dp == (struct devperms_struct *)1L) ++ slot = 0; ++ else if (dp->devhash_next == NULL) ++ slot = (long)m->private + 1; ++ else { ++ (*pos)++; ++ return dp->devhash_next; ++ } ++ ++ for (; slot < DEVPERMS_HASH_SZ; slot++) ++ if (devperms_hash[slot]) { ++ (*pos)++; ++ m->private = (void *)slot; ++ return devperms_hash[slot]; ++ } ++ return NULL; ++} ++ ++static void devperms_seq_stop(struct seq_file *m, void *v) ++{ ++ read_unlock(&devperms_hash_guard); ++} ++ ++static struct seq_operations devperms_seq_op = { ++ .start = devperms_seq_start, ++ .next = devperms_seq_next, ++ .stop = devperms_seq_stop, ++ .show = devperms_seq_show, ++}; ++ ++static int devperms_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &devperms_seq_op); ++} ++ ++static struct file_operations proc_devperms_ops = { ++ .open = devperms_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++#if BITS_PER_LONG == 32 ++#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21) ++#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n" ++#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n" ++#else ++#define VESTAT_LINE_WIDTH (12 * 21) ++#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n" ++#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n" ++#endif ++ ++static int vestat_seq_show(struct seq_file *m, void *v) ++{ ++ struct ve_struct *ve = (struct ve_struct *)v; ++ struct ve_struct 
*curve; ++ int cpu; ++ unsigned long user_ve, nice_ve, system_ve, uptime; ++ cycles_t uptime_cycles, idle_time, strv_time, used; ++ ++ curve = get_exec_env(); ++ if (ve == ve_list_head || ++ (!ve_is_super(curve) && ve == curve)) { ++ /* print header */ ++ seq_printf(m, "%-*s\n", ++ VESTAT_LINE_WIDTH - 1, ++ "Version: 2.2"); ++ seq_printf(m, VESTAT_HEAD_FMT, "VEID", ++ "user", "nice", "system", ++ "uptime", "idle", ++ "strv", "uptime", "used", ++ "maxlat", "totlat", "numsched"); ++ } ++ ++ if (ve == get_ve0()) ++ return 0; ++ ++ user_ve = nice_ve = system_ve = 0; ++ idle_time = strv_time = used = 0; ++ ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ struct ve_cpu_stats *st; ++ ++ st = VE_CPU_STATS(ve, cpu); ++ user_ve += st->user; ++ nice_ve += st->nice; ++ system_ve += st->system; ++ used += VE_CPU_STATS(ve, cpu)->used_time; ++ idle_time += ve_sched_get_idle_time(ve, cpu); ++ } ++ uptime_cycles = get_cycles() - ve->start_cycles; ++ uptime = jiffies - ve->start_jiffies; ++ ++ seq_printf(m, VESTAT_LINE_FMT, ve->veid, ++ user_ve, nice_ve, system_ve, ++ uptime, idle_time, ++ strv_time, uptime_cycles, used, ++ ve->sched_lat_ve.last.maxlat, ++ ve->sched_lat_ve.last.totlat, ++ ve->sched_lat_ve.last.count); ++ return 0; ++} ++ ++static void *ve_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct ve_struct *ve, *curve; ++ loff_t l; ++ ++ curve = get_exec_env(); ++ read_lock(&ve_list_guard); ++ if (!ve_is_super(curve)) { ++ if (*pos != 0) ++ return NULL; ++ return curve; ++ } ++ for (ve = ve_list_head, l = *pos; ++ ve != NULL && l > 0; ++ ve = ve->next, l--); ++ return ve; ++} ++ ++static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct ve_struct *ve = (struct ve_struct *)v; ++ ++ if (!ve_is_super(get_exec_env())) ++ return NULL; ++ (*pos)++; ++ return ve->next; ++} ++ ++static void ve_seq_stop(struct seq_file *m, void *v) ++{ ++ read_unlock(&ve_list_guard); ++} ++ ++static struct seq_operations vestat_seq_op = { ++ start: ve_seq_start, ++ next: ve_seq_next, ++ stop: ve_seq_stop, ++ show: vestat_seq_show ++}; ++ ++static int vestat_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &vestat_seq_op); ++} ++ ++static struct file_operations proc_vestat_operations = { ++ open: vestat_open, ++ read: seq_read, ++ llseek: seq_lseek, ++ release: seq_release ++}; ++ ++static int __init init_vecalls_proc(void) ++{ ++ struct proc_dir_entry *de; ++ ++ de = create_proc_glob_entry("vz/vestat", ++ S_IFREG|S_IRUSR, NULL); ++ if (de == NULL) { ++ /* create "vz" subdirectory, if not exist */ ++ (void) create_proc_glob_entry("vz", ++ S_IFDIR|S_IRUGO|S_IXUGO, NULL); ++ de = create_proc_glob_entry("vz/vestat", ++ S_IFREG|S_IRUSR, NULL); ++ } ++ if (de) ++ de->proc_fops = &proc_vestat_operations; ++ else ++ printk(KERN_WARNING ++ "VZMON: can't make vestat proc entry\n"); ++ ++ de = create_proc_entry("vz/devperms", S_IFREG | S_IRUSR, NULL); ++ if (de) ++ de->proc_fops = &proc_devperms_ops; ++ else ++ printk(KERN_WARNING ++ "VZMON: can't make devperms proc entry\n"); ++ return 0; ++} ++ ++static void fini_vecalls_proc(void) ++{ ++ remove_proc_entry("vz/devperms", NULL); ++ remove_proc_entry("vz/vestat", NULL); ++} ++#else ++#define init_vecalls_proc() (0) ++#define fini_vecalls_proc() do { } while (0) ++#endif /* CONFIG_PROC_FS */ ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * User ctl ++ * ++ 
********************************************************************** ++ **********************************************************************/ ++ ++int vzcalls_ioctl(struct inode *, struct file *, unsigned int, unsigned long); ++static struct vzioctlinfo vzcalls = { ++ type: VZCTLTYPE, ++ func: vzcalls_ioctl, ++ owner: THIS_MODULE, ++}; ++ ++int vzcalls_ioctl(struct inode *ino, struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err; ++ ++ err = -ENOTTY; ++ switch(cmd) { ++ case VZCTL_MARK_ENV_TO_DOWN: { ++ /* Compatibility issue */ ++ err = 0; ++ } ++ break; ++ case VZCTL_SETDEVPERMS: { ++ /* Device type was mistakenly declared as dev_t ++ * in the old user-kernel interface. ++ * That's wrong, dev_t is a kernel internal type. ++ * I use `unsigned' not having anything better in mind. ++ * 2001/08/11 SAW */ ++ struct vzctl_setdevperms s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = real_setdevperms(s.veid, s.type, ++ new_decode_dev(s.dev), s.mask); ++ } ++ break; ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ case VZCTL_VE_NETDEV: { ++ struct vzctl_ve_netdev d; ++ char *s; ++ err = -EFAULT; ++ if (copy_from_user(&d, (void *)arg, sizeof(d))) ++ break; ++ err = -ENOMEM; ++ s = kmalloc(IFNAMSIZ+1, GFP_KERNEL); ++ if (s == NULL) ++ break; ++ err = -EFAULT; ++ if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) { ++ s[IFNAMSIZ] = 0; ++ err = real_ve_dev_map(d.veid, d.op, s); ++ } ++ kfree(s); ++ } ++ break; ++#endif ++ case VZCTL_ENV_CREATE: { ++ struct vzctl_env_create s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = real_env_create(s.veid, s.flags, s.class_id, ++ NULL, 0); ++ } ++ break; ++ case VZCTL_ENV_CREATE_DATA: { ++ struct vzctl_env_create_data s; ++ struct env_create_param *data; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err=-EINVAL; ++ if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN || ++ s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN || ++ s.data == 0) ++ break; ++ err = -ENOMEM; ++ data = kmalloc(s.datalen, GFP_KERNEL); ++ if (!data) ++ break; ++ err = -EFAULT; ++ if (copy_from_user(data, (void *)s.data, s.datalen)) ++ goto free_data; ++ err = real_env_create(s.veid, s.flags, s.class_id, ++ data, s.datalen); ++free_data: ++ kfree(data); ++ } ++ break; ++ case VZCTL_GET_CPU_STAT: { ++ struct vzctl_cpustatctl s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = ve_get_cpu_stat(s.veid, s.cpustat); ++ } ++ break; ++ } ++ return err; ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * Init/exit stuff ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++#ifdef CONFIG_VE_CALLS_MODULE ++static int __init init_vecalls_symbols(void) ++{ ++ KSYMRESOLVE(real_get_device_perms_ve); ++ KSYMRESOLVE(real_do_env_cleanup); ++ KSYMRESOLVE(real_do_env_free); ++ KSYMRESOLVE(real_update_load_avg_ve); ++ KSYMMODRESOLVE(vzmon); ++ return 0; ++} ++ ++static void fini_vecalls_symbols(void) ++{ ++ KSYMMODUNRESOLVE(vzmon); ++ KSYMUNRESOLVE(real_get_device_perms_ve); ++ KSYMUNRESOLVE(real_do_env_cleanup); ++ KSYMUNRESOLVE(real_do_env_free); ++ KSYMUNRESOLVE(real_update_load_avg_ve); ++} ++#else ++#define init_vecalls_symbols() (0) ++#define fini_vecalls_symbols() do { } while (0) ++#endif ++ 
++static inline __init int init_vecalls_ioctls(void) ++{ ++ vzioctl_register(&vzcalls); ++ return 0; ++} ++ ++static inline void fini_vecalls_ioctls(void) ++{ ++ vzioctl_unregister(&vzcalls); ++} ++ ++static int __init vecalls_init(void) ++{ ++ int err; ++ int i; ++ ++ ve_list_head = get_ve0(); ++ ++ err = init_vzmond(); ++ if (err < 0) ++ goto out_vzmond; ++ ++ err = init_devperms_hash(); ++ if (err < 0) ++ goto out_perms; ++ ++ err = init_vecalls_symbols(); ++ if (err < 0) ++ goto out_sym; ++ ++ err = init_vecalls_proc(); ++ if (err < 0) ++ goto out_proc; ++ ++ err = init_vecalls_ioctls(); ++ if (err < 0) ++ goto out_ioctls; ++ ++ for (i = 0; i < VE_MAX_HOOKS; i++) ++ INIT_LIST_HEAD(&ve_hooks[i]); ++ ++ return 0; ++ ++out_ioctls: ++ fini_vecalls_proc(); ++out_proc: ++ fini_vecalls_symbols(); ++out_sym: ++ fini_devperms_hash(); ++out_perms: ++ fini_vzmond(); ++out_vzmond: ++ return err; ++} ++ ++static void vecalls_exit(void) ++{ ++ fini_vecalls_ioctls(); ++ fini_vecalls_proc(); ++ fini_vecalls_symbols(); ++ fini_devperms_hash(); ++ fini_vzmond(); ++} ++ ++EXPORT_SYMBOL(get_ve_by_id); ++EXPORT_SYMBOL(__find_ve_by_id); ++EXPORT_SYMBOL(ve_list_guard); ++EXPORT_SYMBOL(ve_list_head); ++EXPORT_SYMBOL(nr_ve); ++ ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Virtuozzo Control"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(vecalls_init) ++module_exit(vecalls_exit) +diff -uprN linux-2.6.15.orig/kernel/veowner.c linux-2.6.15-ve025stab014/kernel/veowner.c +--- linux-2.6.15.orig/kernel/veowner.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/veowner.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,297 @@ ++/* ++ * kernel/veowner.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/sched.h> ++#include <linux/ve.h> ++#include <linux/ve_owner.h> ++#include <linux/ve_proto.h> ++#include <linux/ipc.h> ++#include <linux/fs.h> ++#include <linux/proc_fs.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/delay.h> ++#include <linux/vmalloc.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/list.h> ++#include <asm/system.h> ++#include <asm/io.h> ++ ++#include <net/tcp.h> ++ ++void prepare_ve0_process(struct task_struct *tsk) ++{ ++ set_virt_pid(tsk, tsk->pid); ++ set_virt_tgid(tsk, tsk->tgid); ++ if (tsk->signal) { ++ set_virt_pgid(tsk, tsk->signal->pgrp); ++ set_virt_sid(tsk, tsk->signal->session); ++ } ++ VE_TASK_INFO(tsk)->exec_env = get_ve0(); ++ VE_TASK_INFO(tsk)->owner_env = get_ve0(); ++ VE_TASK_INFO(tsk)->sleep_time = 0; ++ VE_TASK_INFO(tsk)->wakeup_stamp = 0; ++ VE_TASK_INFO(tsk)->sched_time = 0; ++ seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock); ++ ++ if (tsk->pid) { ++ SET_VE_LINKS(tsk); ++ atomic_inc(&get_ve0()->pcounter); ++ } ++} ++ ++void prepare_ve0_loopback(void) ++{ ++ get_ve0()->_loopback_dev = &loopback_dev; ++} ++ ++/* ++ * ------------------------------------------------------------------------ ++ * proc entries ++ * ------------------------------------------------------------------------ ++ */ ++ ++static void proc_move(struct proc_dir_entry *ddir, ++ struct proc_dir_entry *sdir, ++ const char *name) ++{ ++ struct proc_dir_entry **p, *q; ++ int len; ++ ++ len = strlen(name); ++ for (p = &sdir->subdir, q = *p; q != NULL; p = &q->next, q = *p) ++ if (proc_match(len, name, q)) ++ break; ++ if (q == NULL) ++ return; ++ *p = q->next; ++ q->parent = ddir; ++ q->next = ddir->subdir; ++ ddir->subdir = q; ++} ++static void prepare_proc_misc(void) ++{ ++ static char *table[] = { ++ "loadavg", ++ "uptime", ++ "meminfo", ++ "version", ++ "stat", ++ "filesystems", ++ "locks", ++ "swaps", ++ "mounts", ++ "net", ++ "cpuinfo", ++ "sysvipc", ++ "sys", ++ "fs", ++ "vz", ++ "user_beancounters", ++ "cmdline", ++ "vmstat", ++ NULL, ++ }; ++ char **p; ++ ++ for (p = table; *p != NULL; p++) ++ proc_move(&proc_root, ve0.proc_root, *p); ++} ++int prepare_proc(void) ++{ ++ struct ve_struct *envid; ++ struct proc_dir_entry *de; ++ struct proc_dir_entry *ve_root; ++ ++ envid = set_exec_env(&ve0); ++ ve_root = ve0.proc_root->subdir; ++ /* move the whole tree to be visible in VE0 only */ ++ ve0.proc_root->subdir = proc_root.subdir; ++ for (de = ve0.proc_root->subdir; de->next != NULL; de = de->next) ++ de->parent = ve0.proc_root; ++ de->parent = ve0.proc_root; ++ de->next = ve_root; ++ ++ /* move back into the global scope some specific entries */ ++ proc_root.subdir = NULL; ++ prepare_proc_misc(); ++ proc_net = proc_mkdir("net", ve0.proc_root); ++ proc_net_stat = proc_mkdir("stat", proc_net); ++ proc_mkdir("vz", 0); ++#ifdef CONFIG_SYSVIPC ++ proc_mkdir("sysvipc", 0); ++#endif ++ proc_root_fs = proc_mkdir("fs", 0); ++ /* XXX proc_tty_init(); */ ++ ++ /* XXX process inodes */ ++ ++ (void)set_exec_env(envid); ++ ++ (void)create_proc_glob_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); ++ return 0; ++} ++ ++static struct proc_dir_entry ve0_proc_root = { ++ .name = "/proc", ++ .namelen = 5, ++ .mode = S_IFDIR | S_IRUGO | S_IXUGO, ++ .nlink = 2 ++}; ++ ++void prepare_ve0_proc_root(void) ++{ ++ ve0.proc_root = &ve0_proc_root; ++} ++ ++/* ++ * ------------------------------------------------------------------------ ++ * Virtualized sysctl ++ * ------------------------------------------------------------------------ ++ 
*/ ++ ++static int semmin[4] = { 1, 1, 1, 1 }; ++static int semmax[4] = { 8000, INT_MAX, 1000, IPCMNI }; ++static ctl_table kern_table[] = { ++ {KERN_NODENAME, "hostname", system_utsname.nodename, 64, ++ 0644, NULL, &proc_doutsstring, &sysctl_string}, ++ {KERN_DOMAINNAME, "domainname", system_utsname.domainname, 64, ++ 0644, NULL, &proc_doutsstring, &sysctl_string}, ++#ifdef CONFIG_SYSVIPC ++#define get_ve0_field(fname) &ve0._##fname ++ {KERN_SHMMAX, "shmmax", get_ve0_field(shm_ctlmax), sizeof (size_t), ++ 0644, NULL, &proc_doulongvec_minmax }, ++ {KERN_SHMALL, "shmall", get_ve0_field(shm_ctlall), sizeof (size_t), ++ 0644, NULL, &proc_doulongvec_minmax }, ++ {KERN_SHMMNI, "shmmni", get_ve0_field(shm_ctlmni), sizeof (int), ++ 0644, NULL, &proc_dointvec_minmax, NULL, ++ NULL, &semmin[0], &semmax[3] }, ++ {KERN_MSGMAX, "msgmax", get_ve0_field(msg_ctlmax), sizeof (int), ++ 0644, NULL, &proc_dointvec }, ++ {KERN_MSGMNI, "msgmni", get_ve0_field(msg_ctlmni), sizeof (int), ++ 0644, NULL, &proc_dointvec_minmax, NULL, ++ NULL, &semmin[0], &semmax[3] }, ++ {KERN_MSGMNB, "msgmnb", get_ve0_field(msg_ctlmnb), sizeof (int), ++ 0644, NULL, &proc_dointvec }, ++ {KERN_SEM, "sem", get_ve0_field(sem_ctls), 4*sizeof (int), ++ 0644, NULL, &proc_dointvec }, ++#endif ++ {0} ++}; ++static ctl_table root_table[] = { ++ {CTL_KERN, "kernel", NULL, 0, 0555, kern_table}, ++ {0} ++}; ++extern int ip_rt_src_check; ++extern int ve_area_access_check; ++static ctl_table vz_ipv4_route_table[] = { ++ { ++ ctl_name: NET_IPV4_ROUTE_SRC_CHECK, ++ procname: "src_check", ++ data: &ip_rt_src_check, ++ maxlen: sizeof(int), ++ mode: 0644, ++ proc_handler: &proc_dointvec, ++ }, ++ { 0 } ++}; ++static ctl_table vz_ipv4_table[] = { ++ {NET_IPV4_ROUTE, "route", NULL, 0, 0555, vz_ipv4_route_table}, ++ { 0 } ++}; ++static ctl_table vz_net_table[] = { ++ {NET_IPV4, "ipv4", NULL, 0, 0555, vz_ipv4_table}, ++ { 0 } ++}; ++static ctl_table vz_fs_table[] = { ++ { ++ ctl_name: 226, ++ procname: "ve-area-access-check", ++ data: &ve_area_access_check, ++ maxlen: sizeof(int), ++ mode: 0644, ++ proc_handler: &proc_dointvec, ++ }, ++ { 0 } ++}; ++static ctl_table root_table2[] = { ++ {CTL_NET, "net", NULL, 0, 0555, vz_net_table}, ++ {CTL_FS, "fs", NULL, 0, 0555, vz_fs_table}, ++ { 0 } ++}; ++int prepare_sysctl(void) ++{ ++ struct ve_struct *envid; ++ ++ envid = set_exec_env(&ve0); ++ ve0.kern_header = register_sysctl_table(root_table, 1); ++ register_sysctl_table(root_table2, 0); ++ (void)set_exec_env(envid); ++ return 0; ++} ++ ++void prepare_ve0_sysctl(void) ++{ ++ INIT_LIST_HEAD(&ve0.sysctl_lh); ++#ifdef CONFIG_SYSCTL ++ ve0.proc_sys_root = proc_mkdir("sys", 0); ++#endif ++} ++ ++/* ++ * ------------------------------------------------------------------------ ++ * XXX init_ve_system ++ * ------------------------------------------------------------------------ ++ */ ++ ++void init_ve_system(void) ++{ ++ struct task_struct *init_entry, *p, *tsk; ++ struct ve_struct *ptr; ++ unsigned long flags; ++ int i; ++ ++ ptr = get_ve0(); ++ (void)get_ve(ptr); ++ atomic_set(&ptr->pcounter, 1); ++ ++ /* Don't forget about idle tasks */ ++ write_lock_irqsave(&tasklist_lock, flags); ++ for (i = 0; i < NR_CPUS; i++) { ++ tsk = idle_task(i); ++ if (tsk == NULL) ++ continue; ++ ++ prepare_ve0_process(tsk); ++ } ++ do_each_thread_all(p, tsk) { ++ prepare_ve0_process(tsk); ++ } while_each_thread_all(p, tsk); ++ write_unlock_irqrestore(&tasklist_lock, flags); ++ ++ init_entry = child_reaper; ++ ptr->init_entry = init_entry; ++ /* XXX: why? 
*/ ++ cap_set_full(ptr->cap_default); ++ ++ ptr->_ipv4_devconf = &ipv4_devconf; ++ ptr->_ipv4_devconf_dflt = &ipv4_devconf_dflt; ++ ++ read_lock(&init_entry->fs->lock); ++ ptr->fs_rootmnt = init_entry->fs->rootmnt; ++ ptr->fs_root = init_entry->fs->root; ++ read_unlock(&init_entry->fs->lock); ++ ++ /* common prepares */ ++ prepare_proc(); ++ prepare_sysctl(); ++ prepare_ipc(); ++} +diff -uprN linux-2.6.15.orig/kernel/vzcompat.c linux-2.6.15-ve025stab014/kernel/vzcompat.c +--- linux-2.6.15.orig/kernel/vzcompat.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/vzcompat.c 2006-01-27 14:48:09.000000000 +0300 +@@ -0,0 +1,54 @@ ++#include <linux/config.h> ++#include <linux/fs.h> ++#include <linux/proc_fs.h> ++ ++ ++static ssize_t fake_fairsced_read(struct file *f, char __user *buf, ++ size_t len, loff_t *pos) ++{ ++ return 0; ++} ++ ++static struct file_operations fake_ops = { ++ .read = fake_fairsced_read, ++}; ++ ++ ++/* ++ * Module init/exit. ++ */ ++ ++static int __init ovzcompat_init(void) ++{ ++ int err; ++ struct proc_dir_entry *de; ++ ++ err = -ENOMEM; ++ ++ de = create_proc_glob_entry("fairsched", S_IRUGO, NULL); ++ if (!de) ++ goto err_proc; ++ else ++ de->proc_fops = &fake_ops; ++ ++ de = create_proc_glob_entry("fairsched2", S_IRUGO, NULL); ++ if (!de) ++ goto err_proc2; ++ else ++ de->proc_fops = &fake_ops; ++ return 0; ++ ++err_proc2: ++ remove_proc_entry("fairsched", NULL); ++err_proc: ++ return err; ++} ++ ++void __exit ovzcompat_exit(void) ++{ ++ remove_proc_entry("fairsched2", NULL); ++ remove_proc_entry("fairsched", NULL); ++} ++ ++module_init(ovzcompat_init) ++module_exit(ovzcompat_exit) +diff -uprN linux-2.6.15.orig/kernel/vzdev.c linux-2.6.15-ve025stab014/kernel/vzdev.c +--- linux-2.6.15.orig/kernel/vzdev.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/vzdev.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,97 @@ ++/* ++ * kernel/vzdev.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/fs.h> ++#include <linux/list.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/vzctl.h> ++#include <linux/slab.h> ++#include <linux/vmalloc.h> ++#include <linux/vzcalluser.h> ++#include <asm/uaccess.h> ++#include <asm/pgalloc.h> ++ ++#define VZCTL_MAJOR 126 ++#define VZCTL_NAME "vzctl" ++ ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Virtuozzo Interface"); ++MODULE_LICENSE("GPL v2"); ++ ++static LIST_HEAD(ioctls); ++static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED; ++ ++int vzctl_ioctl(struct inode *ino, struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err; ++ struct list_head *p; ++ struct vzioctlinfo *inf; ++ ++ err = -ENOTTY; ++ spin_lock(&ioctl_lock); ++ list_for_each(p, &ioctls) { ++ inf = list_entry(p, struct vzioctlinfo, list); ++ if (inf->type != _IOC_TYPE(cmd)) ++ continue; ++ ++ err = try_module_get(inf->owner) ? 
0 : -EBUSY; ++ spin_unlock(&ioctl_lock); ++ if (!err) { ++ err = (*inf->func)(ino, file, cmd, arg); ++ module_put(inf->owner); ++ } ++ return err; ++ } ++ spin_unlock(&ioctl_lock); ++ return err; ++} ++ ++void vzioctl_register(struct vzioctlinfo *inf) ++{ ++ spin_lock(&ioctl_lock); ++ list_add(&inf->list, &ioctls); ++ spin_unlock(&ioctl_lock); ++} ++ ++void vzioctl_unregister(struct vzioctlinfo *inf) ++{ ++ spin_lock(&ioctl_lock); ++ list_del_init(&inf->list); ++ spin_unlock(&ioctl_lock); ++} ++ ++EXPORT_SYMBOL(vzioctl_register); ++EXPORT_SYMBOL(vzioctl_unregister); ++ ++/* ++ * Init/exit stuff. ++ */ ++static struct file_operations vzctl_fops = { ++ .owner = THIS_MODULE, ++ .ioctl = vzctl_ioctl, ++}; ++ ++static void __exit vzctl_exit(void) ++{ ++ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); ++} ++ ++static int __init vzctl_init(void) ++{ ++ int ret; ++ ++ ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops); ++ return ret; ++} ++ ++module_init(vzctl_init) ++module_exit(vzctl_exit); +diff -uprN linux-2.6.15.orig/kernel/vzwdog.c linux-2.6.15-ve025stab014/kernel/vzwdog.c +--- linux-2.6.15.orig/kernel/vzwdog.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.15-ve025stab014/kernel/vzwdog.c 2006-01-27 14:48:08.000000000 +0300 +@@ -0,0 +1,278 @@ ++/* ++ * kernel/vzwdog.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/list.h> ++#include <linux/ctype.h> ++#include <linux/kobject.h> ++#include <linux/genhd.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/kernel_stat.h> ++#include <linux/smp_lock.h> ++#include <linux/errno.h> ++#include <linux/suspend.h> ++#include <linux/ve.h> ++#include <linux/vzstat.h> ++ ++/* Staff regading kernel thread polling VE validity */ ++static int sleep_timeout = 60; ++static pid_t wdog_thread_pid; ++static int wdog_thread_continue = 1; ++static DECLARE_COMPLETION(license_thread_exited); ++ ++extern void show_mem(void); ++extern struct ve_struct *ve_list_head; ++ ++#if 0 ++static char page[PAGE_SIZE]; ++ ++static void parse_irq_list(int len) ++{ ++ int i, k, skip; ++ for (i = 0; i < len; ) { ++ k = i; ++ while (i < len && page[i] != '\n' && page[i] != ':') ++ i++; ++ skip = 0; ++ if (i < len && page[i] != '\n') { ++ i++; /* skip ':' */ ++ while (i < len && (page[i] == ' ' || page[i] == '0')) ++ i++; ++ skip = (i < len && (page[i] < '0' || page[i] > '9')); ++ while (i < len && page[i] != '\n') ++ i++; ++ } ++ if (!skip) ++ printk("\n%.*s", i - k, page + k); ++ if (i < len) ++ i++; /* skip '\n' */ ++ } ++} ++#endif ++ ++static void show_irq_list(void) ++{ ++#if 0 ++ i = KSYMSAFECALL(int, get_irq_list, (page)); ++ parse_irq_list(i); /* Safe, zero was returned if unassigned */ ++#endif ++} ++ ++static void show_alloc_latency(void) ++{ ++ static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = { ++ "A0", ++ "L0", ++ "H0", ++ "L1", ++ "H1" ++ }; ++ int i; ++ ++ printk("lat: "); ++ for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) { ++ struct kstat_lat_struct *p; ++ cycles_t maxlat, avg0, avg1, avg2; ++ ++ p = &kstat_glob.alloc_lat[i]; ++ spin_lock_irq(&kstat_glb_lock); ++ maxlat = p->last.maxlat; ++ avg0 = p->avg[0]; ++ avg1 = p->avg[1]; ++ avg2 = p->avg[2]; ++ spin_unlock_irq(&kstat_glb_lock); ++ ++ printk("%s %Lu (%Lu %Lu %Lu)", ++ alloc_descr[i], ++ maxlat, ++ avg0, ++ avg1, ++ avg2); ++ } ++ printk("\n"); ++} ++ ++static void show_schedule_latency(void) ++{ ++ 
struct kstat_lat_pcpu_struct *p; ++ cycles_t maxlat, totlat, avg0, avg1, avg2; ++ unsigned long count; ++ ++ p = &kstat_glob.sched_lat; ++ spin_lock_irq(&kstat_glb_lock); ++ maxlat = p->last.maxlat; ++ totlat = p->last.totlat; ++ count = p->last.count; ++ avg0 = p->avg[0]; ++ avg1 = p->avg[1]; ++ avg2 = p->avg[2]; ++ spin_unlock_irq(&kstat_glb_lock); ++ ++ printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n", ++ maxlat, ++ totlat, ++ count, ++ avg0, ++ avg1, ++ avg2); ++} ++ ++static void show_header(void) ++{ ++ struct timeval tv; ++ ++ do_gettimeofday(&tv); ++ printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n", ++ tv.tv_sec, tv.tv_usec, ++ get_jiffies_64(), smp_processor_id()); ++#ifdef CONFIG_FAIRSCHED ++ printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n", ++ cycles_per_jiffy, HZ); ++#else ++ printk("*** jiffies_per_second %u ***\n", HZ); ++#endif ++} ++ ++static void show_pgdatinfo(void) ++{ ++ pg_data_t *pgdat; ++ ++ printk("pgdat:"); ++ for_each_pgdat(pgdat) { ++ printk(" %d: %lu,%lu,%lu,%p", ++ pgdat->node_id, ++ pgdat->node_start_pfn, ++ pgdat->node_present_pages, ++ pgdat->node_spanned_pages, ++ pgdat->node_mem_map); ++ } ++ printk("\n"); ++} ++ ++static void show_diskio(void) ++{ ++ struct gendisk *gd; ++ char buf[BDEVNAME_SIZE]; ++ ++ printk("disk_io: "); ++ ++ down_read(&block_subsys.rwsem); ++ list_for_each_entry(gd, &block_subsys.kset.list, kobj.entry) { ++ char *name; ++ name = disk_name(gd, 0, buf); ++ if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) && ++ isdigit(name[4])) ++ continue; ++ if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) && ++ isdigit(name[3])) ++ continue; ++ printk("(%u,%u) %s r(%u %u %u) w(%u %u %u)\n", ++ gd->major, gd->first_minor, ++ name, ++ disk_stat_read(gd, ios[READ]), ++ disk_stat_read(gd, sectors[READ]), ++ disk_stat_read(gd, merges[READ]), ++ disk_stat_read(gd, ios[WRITE]), ++ disk_stat_read(gd, sectors[WRITE]), ++ disk_stat_read(gd, merges[WRITE])); ++ } ++ up_read(&block_subsys.rwsem); ++ ++ printk("\n"); ++} ++ ++static void show_nrprocs(void) ++{ ++ unsigned long _nr_running, _nr_sleeping, ++ _nr_unint, _nr_zombie, _nr_dead, _nr_stopped; ++ ++ _nr_running = nr_running(); ++ _nr_unint = nr_uninterruptible(); ++ _nr_sleeping = nr_sleeping(); ++ _nr_zombie = nr_zombie; ++ _nr_dead = nr_dead; ++ _nr_stopped = nr_stopped(); ++ ++ printk("VEnum: %d, proc R %lu, S %lu, D %lu, " ++ "Z %lu, X %lu, T %lu (tot %d)\n", ++ nr_ve, _nr_running, _nr_sleeping, _nr_unint, ++ _nr_zombie, _nr_dead, _nr_stopped, nr_threads); ++} ++ ++static void wdog_print(void) ++{ ++ show_header(); ++ show_irq_list(); ++ show_pgdatinfo(); ++ show_mem(); ++ show_diskio(); ++ show_schedule_latency(); ++ show_alloc_latency(); ++ show_nrprocs(); ++} ++ ++static int wdog_loop(void* data) ++{ ++ struct task_struct *tsk = current; ++ DECLARE_WAIT_QUEUE_HEAD(thread_wait_queue); ++ ++ /* ++ * This thread doesn't need any user-level access, ++ * so get rid of all our resources ++ */ ++ daemonize("wdogd"); ++ ++ spin_lock_irq(&tsk->sighand->siglock); ++ sigfillset(&tsk->blocked); ++ sigdelset(&tsk->blocked, SIGHUP); ++ recalc_sigpending(); ++ spin_unlock_irq(&tsk->sighand->siglock); ++ ++ while (wdog_thread_continue) { ++ wdog_print(); ++ interruptible_sleep_on_timeout(&thread_wait_queue, ++ sleep_timeout*HZ); ++ try_to_freeze(); ++ /* clear all signals */ ++ if (signal_pending(tsk)) ++ flush_signals(tsk); ++ } ++ ++ complete_and_exit(&license_thread_exited, 0); ++} ++ ++static int __init wdog_init(void) ++{ ++ wdog_thread_pid = 
kernel_thread(wdog_loop, NULL, 0); ++ if (wdog_thread_pid < 0) ++ return wdog_thread_pid; ++ ++ return 0; ++} ++ ++static void __exit wdog_exit(void) ++{ ++ wdog_thread_continue = 0; ++ if (wdog_thread_pid > 0) { ++ kill_proc(wdog_thread_pid, SIGHUP, 1); ++ wait_for_completion(&license_thread_exited); ++ } ++} ++ ++MODULE_PARM(sleep_timeout, "i"); ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Virtuozzo WDOG"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(wdog_init) ++module_exit(wdog_exit) +diff -uprN linux-2.6.15.orig/mm/filemap_xip.c linux-2.6.15-ve025stab014/mm/filemap_xip.c +--- linux-2.6.15.orig/mm/filemap_xip.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/filemap_xip.c 2006-01-27 14:48:06.000000000 +0300 +@@ -190,7 +190,10 @@ __xip_unmap (struct address_space * mapp + flush_cache_page(vma, address, pte_pfn(*pte)); + pteval = ptep_clear_flush(vma, address, pte); + page_remove_rmap(page); ++ pb_remove_ref(page, mm); ++ ub_unused_privvm_inc(mm, vma); + dec_mm_counter(mm, file_rss); ++ dec_vma_rss(vma); + BUG_ON(pte_dirty(pteval)); + pte_unmap_unlock(pte, ptl); + page_cache_release(page); +diff -uprN linux-2.6.15.orig/mm/fremap.c linux-2.6.15-ve025stab014/mm/fremap.c +--- linux-2.6.15.orig/mm/fremap.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/fremap.c 2006-01-27 14:48:06.000000000 +0300 +@@ -20,6 +20,8 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++ + static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { +@@ -34,6 +36,7 @@ static int zap_pte(struct mm_struct *mm, + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); ++ pb_remove_ref(page, mm); + page_cache_release(page); + } + } else { +@@ -57,6 +60,10 @@ int install_page(struct mm_struct *mm, s + pte_t *pte; + pte_t pte_val; + spinlock_t *ptl; ++ struct page_beancounter *pbc; ++ ++ if (unlikely(pb_alloc(&pbc))) ++ goto out_nopb; + + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) +@@ -75,11 +82,15 @@ int install_page(struct mm_struct *mm, s + if (page_mapcount(page) > INT_MAX/2) + goto unlock; + +- if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) ++ if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) { ++ ub_unused_privvm_dec(mm, vma); + inc_mm_counter(mm, file_rss); ++ inc_vma_rss(vma); ++ } + + flush_icache_page(vma, page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); ++ pb_add_ref(page, mm, &pbc); + page_add_file_rmap(page); + pte_val = *pte; + update_mmu_cache(vma, addr, pte_val); +@@ -87,6 +98,8 @@ int install_page(struct mm_struct *mm, s + unlock: + pte_unmap_unlock(pte, ptl); + out: ++ pb_free(&pbc); ++out_nopb: + return err; + } + EXPORT_SYMBOL(install_page); +@@ -109,7 +122,9 @@ int install_file_pte(struct mm_struct *m + + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { + update_hiwater_rss(mm); ++ ub_unused_privvm_inc(mm, vma); + dec_mm_counter(mm, file_rss); ++ dec_vma_rss(vma); + } + + set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); +diff -uprN linux-2.6.15.orig/mm/memory.c linux-2.6.15-ve025stab014/mm/memory.c +--- linux-2.6.15.orig/mm/memory.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/memory.c 2006-01-27 14:48:08.000000000 +0300 +@@ -58,6 +58,8 @@ + #include <linux/swapops.h> + #include <linux/elf.h> + ++#include <ub/ub_vmpages.h> ++ + #ifndef CONFIG_NEED_MULTIPLE_NODES + /* use the per-pgdat data instead for discontigmem - mbligh */ + unsigned long max_mapnr; +@@ -418,7 +420,7 @@ struct 
page *vm_normal_page(struct vm_ar + static inline void + copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, +- unsigned long addr, int *rss) ++ unsigned long addr, int *rss, struct page_beancounter **pbc) + { + unsigned long vm_flags = vma->vm_flags; + pte_t pte = *src_pte; +@@ -461,6 +463,7 @@ copy_one_pte(struct mm_struct *dst_mm, s + if (page) { + get_page(page); + page_dup_rmap(page); ++ pb_add_list_ref(page, dst_mm, pbc); + rss[!!PageAnon(page)]++; + } + +@@ -468,20 +471,35 @@ out_set_pte: + set_pte_at(dst_mm, addr, dst_pte, pte); + } + ++#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1))) ++#ifdef CONFIG_USER_RESOURCE ++#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub) ++#else ++#define same_ub(mm1, mm2) (1) ++#endif ++ + static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, ++ pmd_t *dst_pmd, pmd_t *src_pmd, ++ struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { + pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; + int progress = 0; +- int rss[2]; ++ int rss[2], rss_tot; ++ struct page_beancounter *pbc; + ++ pbc = NULL; + again: ++ if (!same_ub(src_mm, dst_mm) && ++ pb_alloc_list(&pbc, pte_ptrs(addr))) ++ goto nomem; + rss[1] = rss[0] = 0; + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + if (!dst_pte) +- return -ENOMEM; ++ goto nomem_pb; ++ + src_pte = pte_offset_map_nested(src_pmd, addr); + src_ptl = pte_lockptr(src_mm, src_pmd); + spin_lock(src_ptl); +@@ -502,22 +520,35 @@ again: + progress++; + continue; + } +- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); ++ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, ++ vma, addr, rss, &pbc); + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + spin_unlock(src_ptl); + pte_unmap_nested(src_pte - 1); ++ rss_tot = rss[0] + rss[1]; ++ add_vma_rss(dst_vma, rss_tot); ++ ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot); + add_mm_rss(dst_mm, rss[0], rss[1]); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); + if (addr != end) + goto again; ++ ++ pb_free_list(&pbc); + return 0; ++ ++nomem_pb: ++ pb_free(&pbc); ++nomem: ++ return -ENOMEM; + } + + static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, ++ pud_t *dst_pud, pud_t *src_pud, ++ struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { + pmd_t *src_pmd, *dst_pmd; +@@ -532,14 +563,16 @@ static inline int copy_pmd_range(struct + if (pmd_none_or_clear_bad(src_pmd)) + continue; + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, +- vma, addr, next)) ++ dst_vma, vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; + } + + static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, ++ pgd_t *dst_pgd, pgd_t *src_pgd, ++ struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { + pud_t *src_pud, *dst_pud; +@@ -554,14 +587,14 @@ static inline int copy_pud_range(struct + if (pud_none_or_clear_bad(src_pud)) + continue; + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, +- vma, addr, next)) ++ dst_vma, vma, addr, next)) + return -ENOMEM; 
+ } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; + } + + int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- struct vm_area_struct *vma) ++ struct vm_area_struct *dst_vma, struct vm_area_struct *vma) + { + pgd_t *src_pgd, *dst_pgd; + unsigned long next; +@@ -589,7 +622,7 @@ int copy_page_range(struct mm_struct *ds + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, +- vma, addr, next)) ++ dst_vma, vma, addr, next)) + return -ENOMEM; + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + return 0; +@@ -605,6 +638,7 @@ static unsigned long zap_pte_range(struc + spinlock_t *ptl; + int file_rss = 0; + int anon_rss = 0; ++ int rss; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + do { +@@ -657,6 +691,7 @@ static unsigned long zap_pte_range(struc + file_rss--; + } + page_remove_rmap(page); ++ pb_remove_ref(page, mm); + tlb_remove_page(tlb, page); + continue; + } +@@ -671,6 +706,9 @@ static unsigned long zap_pte_range(struc + pte_clear_full(mm, addr, pte, tlb->fullmm); + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + ++ rss = -(file_rss + anon_rss); ++ ub_unused_privvm_add(mm, vma, rss); ++ sub_vma_rss(vma, rss); + add_mm_rss(mm, file_rss, anon_rss); + pte_unmap_unlock(pte - 1, ptl); + +@@ -883,8 +921,9 @@ struct page *follow_page(struct vm_area_ + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *page; +- struct mm_struct *mm = vma->vm_mm; ++ struct mm_struct *mm; + ++ mm = (flags & FOLL_KERN ? &init_mm : vma->vm_mm); + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); +@@ -892,7 +931,10 @@ struct page *follow_page(struct vm_area_ + } + + page = NULL; +- pgd = pgd_offset(mm, address); ++ if (flags & FOLL_KERN) ++ pgd = pgd_offset_k(address); ++ else ++ pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + +@@ -910,6 +952,9 @@ struct page *follow_page(struct vm_area_ + goto out; + } + ++ if (flags & FOLL_KERN) ++ goto kern; ++ + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + goto out; +@@ -919,6 +964,7 @@ struct page *follow_page(struct vm_area_ + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; ++ + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) + goto unlock; +@@ -936,6 +982,15 @@ unlock: + out: + return page; + ++kern: ++ ptep = pte_offset_map(pmd, address); ++ BUG_ON(!ptep); ++ pte = *ptep; ++ BUG_ON(!pte_present(pte)); ++ page = pte_page(pte); ++ pte_unmap(ptep); ++ return page; ++ + no_page_table: + /* + * When core dumping an enormous anonymous area that nobody +@@ -1076,12 +1131,14 @@ int get_user_pages(struct task_struct *t + } + EXPORT_SYMBOL(get_user_pages); + +-static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, ++static int zeromap_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t prot) + { + pte_t *pte; + spinlock_t *ptl; ++ struct mm_struct *mm; + ++ mm = vma->vm_mm; + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; +@@ -1091,6 +1148,7 @@ static int zeromap_pte_range(struct mm_s + page_cache_get(page); + page_add_file_rmap(page); + inc_mm_counter(mm, file_rss); ++ inc_vma_rss(vma); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, zero_pte); + } while (pte++, addr += PAGE_SIZE, addr != end); +@@ -1098,35 +1156,35 @@ static int zeromap_pte_range(struct mm_s + return 0; + } + +-static 
inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, ++static inline int zeromap_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, pgprot_t prot) + { + pmd_t *pmd; + unsigned long next; + +- pmd = pmd_alloc(mm, pud, addr); ++ pmd = pmd_alloc(vma->vm_mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); +- if (zeromap_pte_range(mm, pmd, addr, next, prot)) ++ if (zeromap_pte_range(vma, pmd, addr, next, prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; + } + +-static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, ++static inline int zeromap_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, pgprot_t prot) + { + pud_t *pud; + unsigned long next; + +- pud = pud_alloc(mm, pgd, addr); ++ pud = pud_alloc(vma->vm_mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); +- if (zeromap_pmd_range(mm, pud, addr, next, prot)) ++ if (zeromap_pmd_range(vma, pud, addr, next, prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +@@ -1138,15 +1196,14 @@ int zeromap_page_range(struct vm_area_st + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; +- struct mm_struct *mm = vma->vm_mm; + int err; + + BUG_ON(addr >= end); +- pgd = pgd_offset(mm, addr); ++ pgd = pgd_offset(vma->vm_mm, addr); + flush_cache_range(vma, addr, end); + do { + next = pgd_addr_end(addr, end); +- err = zeromap_pud_range(mm, pgd, addr, next, prot); ++ err = zeromap_pud_range(vma, pgd, addr, next, prot); + if (err) + break; + } while (pgd++, addr = next, addr != end); +@@ -1172,11 +1229,14 @@ pte_t * fastcall get_locked_pte(struct m + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway. + */ +-static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) ++static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) + { + int retval; + pte_t *pte; +- spinlock_t *ptl; ++ spinlock_t *ptl; ++ struct mm_struct *mm; ++ ++ mm = vma->vm_mm; + + retval = -EINVAL; + if (PageAnon(page)) +@@ -1193,6 +1253,7 @@ static int insert_page(struct mm_struct + /* Ok, finally just insert the thing.. 
*/ + get_page(page); + inc_mm_counter(mm, file_rss); ++ inc_vma_rss(vma); + page_add_file_rmap(page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + +@@ -1229,7 +1290,7 @@ int vm_insert_page(struct vm_area_struct + if (!page_count(page)) + return -EINVAL; + vma->vm_flags |= VM_INSERTPAGE; +- return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); ++ return insert_page(vma, addr, page, vma->vm_page_prot); + } + EXPORT_SYMBOL(vm_insert_page); + +@@ -1438,6 +1499,7 @@ static int do_wp_page(struct mm_struct * + struct page *old_page, *new_page; + pte_t entry; + int ret = VM_FAULT_MINOR; ++ struct page_beancounter *pbc; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) +@@ -1465,6 +1527,9 @@ static int do_wp_page(struct mm_struct * + gotten: + pte_unmap_unlock(page_table, ptl); + ++ if (unlikely(pb_alloc(&pbc))) ++ goto oom_nopb; ++ + if (unlikely(anon_vma_prepare(vma))) + goto oom; + if (old_page == ZERO_PAGE(address)) { +@@ -1485,12 +1550,16 @@ gotten: + if (likely(pte_same(*page_table, orig_pte))) { + if (old_page) { + page_remove_rmap(old_page); ++ pb_remove_ref(old_page, mm); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); + inc_mm_counter(mm, anon_rss); + } +- } else ++ } else { ++ ub_unused_privvm_dec(mm, vma); + inc_mm_counter(mm, anon_rss); ++ inc_vma_rss(vma); ++ } + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); +@@ -1499,6 +1568,7 @@ gotten: + lazy_mmu_prot_update(entry); + lru_cache_add_active(new_page); + page_add_anon_rmap(new_page, vma, address); ++ pb_add_ref(new_page, mm, &pbc); + + /* Free the old page.. */ + new_page = old_page; +@@ -1508,10 +1578,13 @@ gotten: + page_cache_release(new_page); + if (old_page) + page_cache_release(old_page); ++ pb_free(&pbc); + unlock: + pte_unmap_unlock(page_table, ptl); + return ret; + oom: ++ pb_free(&pbc); ++oom_nopb: + if (old_page) + page_cache_release(old_page); + return VM_FAULT_OOM; +@@ -1843,10 +1916,16 @@ static int do_swap_page(struct mm_struct + swp_entry_t entry; + pte_t pte; + int ret = VM_FAULT_MINOR; ++ struct page_beancounter *pbc; ++ cycles_t start; + + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) +- goto out; ++ goto out_nostat; + ++ if (unlikely(pb_alloc(&pbc))) ++ return VM_FAULT_OOM; ++ ++ start = get_cycles(); + entry = pte_to_swp_entry(orig_pte); + page = lookup_swap_cache(entry); + if (!page) { +@@ -1887,6 +1966,8 @@ static int do_swap_page(struct mm_struct + /* The page isn't present yet, go ahead with the fault. 
*/ + + inc_mm_counter(mm, anon_rss); ++ inc_vma_rss(vma); ++ ub_swapin_inc(mm); + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); +@@ -1896,6 +1977,8 @@ static int do_swap_page(struct mm_struct + flush_icache_page(vma, page); + set_pte_at(mm, address, page_table, pte); + page_add_anon_rmap(page, vma, address); ++ pb_add_ref(page, mm, &pbc); ++ ub_unused_privvm_dec(mm, vma); + + swap_free(entry); + if (vm_swap_full()) +@@ -1906,7 +1989,7 @@ static int do_swap_page(struct mm_struct + if (do_wp_page(mm, vma, address, + page_table, pmd, ptl, pte) == VM_FAULT_OOM) + ret = VM_FAULT_OOM; +- goto out; ++ goto out_wp; + } + + /* No need to invalidate - it was non-present before */ +@@ -1914,10 +1997,16 @@ static int do_swap_page(struct mm_struct + lazy_mmu_prot_update(pte); + unlock: + pte_unmap_unlock(page_table, ptl); +-out: ++out_wp: ++ pb_free(&pbc); ++ spin_lock_irq(&kstat_glb_lock); ++ KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start); ++ spin_unlock_irq(&kstat_glb_lock); ++out_nostat: + return ret; + out_nomap: + pte_unmap_unlock(page_table, ptl); ++ pb_free(&pbc); + unlock_page(page); + page_cache_release(page); + return ret; +@@ -1935,11 +2024,15 @@ static int do_anonymous_page(struct mm_s + struct page *page; + spinlock_t *ptl; + pte_t entry; ++ struct page_beancounter *pbc; + + if (write_access) { + /* Allocate our own private page. */ + pte_unmap(page_table); + ++ if (unlikely(pb_alloc(&pbc))) ++ goto oom_nopb; ++ + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_zeroed_user_highpage(vma, address); +@@ -1957,6 +2050,9 @@ static int do_anonymous_page(struct mm_s + SetPageReferenced(page); + page_add_anon_rmap(page, vma, address); + } else { ++ if (unlikely(__pb_alloc(&pbc, GFP_ATOMIC))) ++ goto oom_nopb_locked; ++ + /* Map the ZERO_PAGE - vm_page_prot is readonly */ + page = ZERO_PAGE(address); + page_cache_get(page); +@@ -1970,18 +2066,28 @@ static int do_anonymous_page(struct mm_s + page_add_file_rmap(page); + } + ++ inc_vma_rss(vma); ++ pb_add_ref(page, mm, &pbc); ++ ub_unused_privvm_dec(mm, vma); + set_pte_at(mm, address, page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + unlock: ++ pb_free(&pbc); + pte_unmap_unlock(page_table, ptl); + return VM_FAULT_MINOR; + release: + page_cache_release(page); + goto unlock; + oom: ++ pb_free(&pbc); ++oom_nopb: ++ return VM_FAULT_OOM; ++ ++oom_nopb_locked: ++ pte_unmap_unlock(page_table, ptl); + return VM_FAULT_OOM; + } + +@@ -2009,6 +2115,7 @@ static int do_no_page(struct mm_struct * + unsigned int sequence = 0; + int ret = VM_FAULT_MINOR; + int anon = 0; ++ struct page_beancounter *pbc; + + pte_unmap(page_table); + BUG_ON(vma->vm_flags & VM_PFNMAP); +@@ -2018,6 +2125,9 @@ static int do_no_page(struct mm_struct * + sequence = mapping->truncate_count; + smp_rmb(); /* serializes i_size against truncate_count */ + } ++ ++ if (unlikely(pb_alloc(&pbc))) ++ goto oom_nopb; + retry: + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + /* +@@ -2030,9 +2140,9 @@ retry: + + /* no page was available -- either SIGBUS or OOM */ + if (new_page == NOPAGE_SIGBUS) +- return VM_FAULT_SIGBUS; ++ goto bus_nopg; + if (new_page == NOPAGE_OOM) +- return VM_FAULT_OOM; ++ goto oom_nopg; + + /* + * Should we do an early C-O-W break? 
+@@ -2091,6 +2201,9 @@ retry: + inc_mm_counter(mm, file_rss); + page_add_file_rmap(new_page); + } ++ inc_vma_rss(vma); ++ pb_add_ref(new_page, mm, &pbc); ++ ub_unused_privvm_dec(mm, vma); + } else { + /* One of our sibling threads was faster, back out. */ + page_cache_release(new_page); +@@ -2102,10 +2215,18 @@ retry: + lazy_mmu_prot_update(entry); + unlock: + pte_unmap_unlock(page_table, ptl); ++ pb_free(&pbc); + return ret; + oom: + page_cache_release(new_page); ++oom_nopg: ++ pb_free(&pbc); ++oom_nopb: + return VM_FAULT_OOM; ++ ++bus_nopg: ++ pb_free(&pbc); ++ return VM_FAULT_SIGBUS; + } + + /* +diff -uprN linux-2.6.15.orig/mm/mempool.c linux-2.6.15-ve025stab014/mm/mempool.c +--- linux-2.6.15.orig/mm/mempool.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/mempool.c 2006-01-27 14:48:06.000000000 +0300 +@@ -14,6 +14,7 @@ + #include <linux/mempool.h> + #include <linux/blkdev.h> + #include <linux/writeback.h> ++#include <linux/kmem_cache.h> + + static void add_element(mempool_t *pool, void *element) + { +@@ -78,6 +79,8 @@ mempool_t *mempool_create_node(int min_n + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; ++ if (alloc_fn == mempool_alloc_slab) ++ kmem_mark_nocharge((kmem_cache_t *)pool_data); + + /* + * First pre-allocate the guaranteed number of buffers. +@@ -119,6 +122,7 @@ int mempool_resize(mempool_t *pool, int + unsigned long flags; + + BUG_ON(new_min_nr <= 0); ++ gfp_mask &= ~__GFP_UBC; + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr <= pool->min_nr) { +@@ -212,6 +216,7 @@ void * mempool_alloc(mempool_t *pool, gf + gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ + gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ + gfp_mask |= __GFP_NOWARN; /* failures are OK */ ++ gfp_mask &= ~__GFP_UBC; + + gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + +diff -uprN linux-2.6.15.orig/mm/mlock.c linux-2.6.15-ve025stab014/mm/mlock.c +--- linux-2.6.15.orig/mm/mlock.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/mlock.c 2006-01-27 14:48:06.000000000 +0300 +@@ -10,6 +10,7 @@ + #include <linux/mempolicy.h> + #include <linux/syscalls.h> + ++#include <ub/ub_vmpages.h> + + static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, unsigned int newflags) +@@ -24,6 +25,12 @@ static int mlock_fixup(struct vm_area_st + goto out; + } + ++ if (newflags & VM_LOCKED) { ++ ret = ub_locked_charge(mm, end - start); ++ if (ret < 0) ++ goto out; ++ } ++ + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); +@@ -62,7 +69,8 @@ success: + pages = -pages; + if (!(newflags & VM_IO)) + ret = make_pages_present(start, end); +- } ++ } else ++ ub_locked_uncharge(mm, end - start); + + vma->vm_mm->locked_vm -= pages; + out: +diff -uprN linux-2.6.15.orig/mm/mmap.c linux-2.6.15-ve025stab014/mm/mmap.c +--- linux-2.6.15.orig/mm/mmap.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/mmap.c 2006-01-27 14:48:06.000000000 +0300 +@@ -24,14 +24,18 @@ + #include <linux/mount.h> + #include <linux/mempolicy.h> + #include <linux/rmap.h> ++#include <linux/virtinfo.h> + + #include <asm/uaccess.h> + #include <asm/cacheflush.h> + #include <asm/tlb.h> + ++#include <ub/ub_vmpages.h> ++ + static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, 
unsigned long end); ++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft); + + /* + * WARNING: the debugging will use recursive algorithms so never enable this +@@ -86,6 +90,16 @@ int __vm_enough_memory(long pages, int c + + vm_acct_memory(pages); + ++ switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM, ++ (void *)pages) ++ & (NOTIFY_OK | NOTIFY_FAIL)) { ++ case NOTIFY_OK: ++ return 0; ++ case NOTIFY_FAIL: ++ vm_unacct_memory(pages); ++ return -ENOMEM; ++ } ++ + /* + * Sometimes we want to use more memory than we have + */ +@@ -200,11 +214,16 @@ static struct vm_area_struct *remove_vma + struct vm_area_struct *next = vma->vm_next; + + might_sleep(); ++ ++ ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start, ++ vma->vm_flags, vma->vm_file); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) + fput(vma->vm_file); + mpol_free(vma_policy(vma)); ++ if (get_vma_rss(vma)) ++ warn_bad_rss(vma, 0); + kmem_cache_free(vm_area_cachep, vma); + return next; + } +@@ -241,7 +260,7 @@ asmlinkage unsigned long sys_brk(unsigne + goto out; + + /* Ok, looks good - let it rip. */ +- if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) ++ if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk) + goto out; + set_brk: + mm->brk = brk; +@@ -725,7 +744,7 @@ struct vm_area_struct *vma_merge(struct + else + next = mm->mmap; + area = next; +- if (next && next->vm_end == end) /* cases 6, 7, 8 */ ++ if (next && next->vm_end == end) /* cases 6, 7, 8 */ + next = next->vm_next; + + /* +@@ -745,11 +764,22 @@ struct vm_area_struct *vma_merge(struct + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma)) { + /* cases 1, 6 */ ++ add_vma_rss(prev, get_vma_rss(next)); ++ if (area != next) /* case 6 */ ++ add_vma_rss(prev, get_vma_rss(area)); + vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL); +- } else /* cases 2, 5, 7 */ ++ } else { /* cases 2, 5, 7 */ ++ if (next && addr == next->vm_start) { /* case 5 */ ++ unsigned long rss; ++ rss = pages_in_vma_range(next, addr, end); ++ sub_vma_rss(next, rss); ++ add_vma_rss(prev, rss); ++ } else if (area != next) /* case 7 */ ++ add_vma_rss(prev, get_vma_rss(area)); + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); ++ } + return prev; + } + +@@ -760,12 +790,19 @@ struct vm_area_struct *vma_merge(struct + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen)) { +- if (prev && addr < prev->vm_end) /* case 4 */ ++ if (prev && addr < prev->vm_end) { /* case 4 */ ++ unsigned long rss; ++ rss = pages_in_vma_range(prev, addr, end); ++ sub_vma_rss(prev, rss); ++ add_vma_rss(next, rss); + vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL); +- else /* cases 3, 8 */ ++ } else { /* cases 3, 8 */ ++ if (area != next) /* case 8 */ ++ add_vma_rss(area, get_vma_rss(next)); + vma_adjust(area, addr, next->vm_end, + next->vm_pgoff - pglen, NULL); ++ } + return area; + } + +@@ -1032,6 +1069,10 @@ munmap_back: + } + } + ++ if (ub_memory_charge(mm, len, vm_flags, file, ++ (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) ++ goto charge_error; ++ + /* + * Can we just expand an old private anonymous mapping? + * The VM_SHARED test is necessary because shmem_zero_setup +@@ -1047,7 +1088,8 @@ munmap_back: + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. 
+ */ +- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); ++ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | ++ (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0)); + if (!vma) { + error = -ENOMEM; + goto unacct_error; +@@ -1141,6 +1183,8 @@ unmap_and_free_vma: + free_vma: + kmem_cache_free(vm_area_cachep, vma); + unacct_error: ++ ub_memory_uncharge(mm, len, vm_flags, file); ++charge_error: + if (charged) + vm_unacct_memory(charged); + return error; +@@ -1470,12 +1514,16 @@ static int acct_stack_growth(struct vm_a + return -ENOMEM; + } + ++ if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags, ++ vma->vm_file, UB_SOFT)) ++ goto fail_charge; ++ + /* + * Overcommit.. This must be the final test, as it will + * update security statistics. + */ + if (security_vm_enough_memory(grow)) +- return -ENOMEM; ++ goto fail_sec; + + /* Ok, everything looks good - let it rip */ + mm->total_vm += grow; +@@ -1483,6 +1531,11 @@ static int acct_stack_growth(struct vm_a + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); + return 0; ++ ++fail_sec: ++ ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file); ++fail_charge: ++ return -ENOMEM; + } + + #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) +@@ -1743,6 +1796,10 @@ int split_vma(struct mm_struct * mm, str + else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + ++ /* protected with mmap sem */ ++ set_vma_rss(vma, pages_in_vma(vma)); ++ set_vma_rss(new, pages_in_vma(new)); ++ + return 0; + } + +@@ -1838,7 +1895,7 @@ static inline void verify_mm_writelocked + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +-unsigned long do_brk(unsigned long addr, unsigned long len) ++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft) + { + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; +@@ -1890,11 +1947,14 @@ unsigned long do_brk(unsigned long addr, + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + +- if (security_vm_enough_memory(len >> PAGE_SHIFT)) +- return -ENOMEM; +- + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + ++ if (ub_memory_charge(mm, len, flags, NULL, soft)) ++ goto fail_charge; ++ ++ if (security_vm_enough_memory(len >> PAGE_SHIFT)) ++ goto fail_sec; ++ + /* Can we just expand an old private anonymous mapping? */ + if (vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL)) +@@ -1903,11 +1963,11 @@ unsigned long do_brk(unsigned long addr, + /* + * create a vma struct for an anonymous mapping + */ +- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); +- if (!vma) { +- vm_unacct_memory(len >> PAGE_SHIFT); +- return -ENOMEM; +- } ++ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | ++ (soft == UB_SOFT ? __GFP_SOFT_UBC : 0)); ++ if (!vma) ++ goto fail_alloc; ++ + memset(vma, 0, sizeof(*vma)); + + vma->vm_mm = mm; +@@ -1924,8 +1984,19 @@ out: + make_pages_present(addr, addr + len); + } + return addr; ++ ++fail_alloc: ++ vm_unacct_memory(len >> PAGE_SHIFT); ++fail_sec: ++ ub_memory_uncharge(mm, len, flags, NULL); ++fail_charge: ++ return -ENOMEM; + } + ++unsigned long do_brk(unsigned long addr, unsigned long len) ++{ ++ return __do_brk(addr, len, UB_SOFT); ++} + EXPORT_SYMBOL(do_brk); + + /* Release all mmaps. 
*/ +@@ -2035,6 +2106,7 @@ struct vm_area_struct *copy_vma(struct v + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; ++ set_vma_rss(new_vma, 0); + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) +diff -uprN linux-2.6.15.orig/mm/mprotect.c linux-2.6.15-ve025stab014/mm/mprotect.c +--- linux-2.6.15.orig/mm/mprotect.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/mprotect.c 2006-01-27 14:48:06.000000000 +0300 +@@ -25,6 +25,8 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++ + static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot) + { +@@ -109,12 +111,20 @@ mprotect_fixup(struct vm_area_struct *vm + pgprot_t newprot; + pgoff_t pgoff; + int error; ++ unsigned long ch_size; ++ int ch_dir; + + if (newflags == oldflags) { + *pprev = vma; + return 0; + } + ++ error = -ENOMEM; ++ ch_size = nrpages - pages_in_vma_range(vma, start, end); ++ ch_dir = ub_protected_charge(mm, ch_size, newflags, vma); ++ if (ch_dir == PRIVVM_ERROR) ++ goto fail_ch; ++ + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we +@@ -127,7 +137,7 @@ mprotect_fixup(struct vm_area_struct *vm + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + charged = nrpages; + if (security_vm_enough_memory(charged)) +- return -ENOMEM; ++ goto fail_sec; + newflags |= VM_ACCOUNT; + } + } +@@ -169,10 +179,16 @@ success: + change_protection(vma, start, end, newprot); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + vm_stat_account(mm, newflags, vma->vm_file, nrpages); ++ if (ch_dir == PRIVVM_TO_SHARED) ++ __ub_unused_privvm_dec(mm, ch_size); + return 0; + + fail: + vm_unacct_memory(charged); ++fail_sec: ++ if (ch_dir == PRIVVM_TO_PRIVATE) ++ __ub_unused_privvm_dec(mm, ch_size); ++fail_ch: + return error; + } + +diff -uprN linux-2.6.15.orig/mm/mremap.c linux-2.6.15-ve025stab014/mm/mremap.c +--- linux-2.6.15.orig/mm/mremap.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/mremap.c 2006-01-27 14:48:06.000000000 +0300 +@@ -22,6 +22,8 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++ + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) + { + pgd_t *pgd; +@@ -105,6 +107,8 @@ static void move_ptes(struct vm_area_str + pte = ptep_clear_flush(vma, old_addr, old_pte); + /* ZERO_PAGE can be dependant on virtual addr */ + pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); ++ dec_vma_rss(vma); ++ inc_vma_rss(new_vma); + set_pte_at(mm, new_addr, new_pte, pte); + } + +@@ -165,17 +169,21 @@ static unsigned long move_vma(struct vm_ + unsigned long hiwater_vm; + int split = 0; + ++ if (ub_memory_charge(mm, new_len, vm_flags, ++ vma->vm_file, UB_HARD)) ++ goto err; ++ + /* + * We'd prefer to avoid failure later on in do_munmap: + * which may split one vma into three before unmapping. 
+ */ + if (mm->map_count >= sysctl_max_map_count - 3) +- return -ENOMEM; ++ goto err_nomem; + + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + if (!new_vma) +- return -ENOMEM; ++ goto err_nomem; + + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + if (moved_len < old_len) { +@@ -234,7 +242,13 @@ static unsigned long move_vma(struct vm_ + new_addr + new_len); + } + +- return new_addr; ++ if (new_addr != -ENOMEM) ++ return new_addr; ++ ++err_nomem: ++ ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file); ++err: ++ return -ENOMEM; + } + + /* +@@ -360,6 +374,11 @@ unsigned long do_mremap(unsigned long ad + if (max_addr - addr >= new_len) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, new_len, vma->vm_flags, ++ vma->vm_file, UB_HARD)) ++ goto out; ++ + vma_adjust(vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL); + +diff -uprN linux-2.6.15.orig/mm/oom_kill.c linux-2.6.15-ve025stab014/mm/oom_kill.c +--- linux-2.6.15.orig/mm/oom_kill.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/oom_kill.c 2006-01-27 14:48:08.000000000 +0300 +@@ -22,6 +22,9 @@ + #include <linux/jiffies.h> + #include <linux/cpuset.h> + ++#include <ub/beancounter.h> ++#include <ub/ub_oom.h> ++ + /* #define DEBUG */ + + /** +@@ -48,6 +51,9 @@ unsigned long badness(struct task_struct + unsigned long points, cpu_time, run_time, s; + struct list_head *tsk; + ++ if (p->flags & PF_SWAPOFF) ++ return ULONG_MAX; ++ + if (!p->mm) + return 0; + +@@ -117,14 +123,19 @@ unsigned long badness(struct task_struct + * Adjust the score by oomkilladj. + */ + if (p->oomkilladj) { +- if (p->oomkilladj > 0) +- points <<= p->oomkilladj; +- else ++ if (p->oomkilladj > 0) { ++ unsigned long long points_long; ++ points_long = ++ (unsigned long long)points << p->oomkilladj; ++ points = ULONG_MAX; ++ if (points_long < ULONG_MAX) ++ points = points_long; ++ } else + points >>= -(p->oomkilladj); + } + + #ifdef DEBUG +- printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", ++ printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", + p->pid, p->comm, points); + #endif + return points; +@@ -141,39 +152,60 @@ static struct task_struct * select_bad_p + unsigned long maxpoints = 0; + struct task_struct *g, *p; + struct task_struct *chosen = NULL; ++ int chosen_oomadj = OOM_DISABLE; + struct timespec uptime; ++ struct user_beancounter *ub; + ++retry: ++ ub = ub_oom_select_worst(); + do_posix_clock_monotonic_gettime(&uptime); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + unsigned long points; + int releasing; + + /* skip the init task with pid == 1 */ + if (p->pid == 1) + continue; +- if (p->oomkilladj == OOM_DISABLE) +- continue; + /* If p's nodes don't overlap ours, it won't help to kill p. */ + if (!cpuset_excl_nodes_overlap(p)) + continue; ++ if (!ub_oom_task_match(p, ub)) ++ continue; + +- /* +- * This is in the process of releasing memory so for wait it +- * to finish before killing some other task by mistake. 
+- */ + releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || + p->flags & PF_EXITING; +- if (releasing && !(p->flags & PF_DEAD)) +- return ERR_PTR(-1UL); +- if (p->flags & PF_SWAPOFF) +- return p; ++ /* Skip the process, its killing will not help */ ++ if (releasing) ++ continue; + + points = badness(p, uptime.tv_sec); ++ if (p->oomkilladj == OOM_DISABLE && ++ chosen_oomadj != OOM_DISABLE) ++ continue; ++ if (points == ULONG_MAX) { ++ /* There is no better choice. Let's kill */ ++ chosen = p; ++ goto done; ++ } + if (points > maxpoints || !chosen) { + chosen = p; ++ chosen_oomadj = p->oomkilladj; + maxpoints = points; + } +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); ++ ++ /* Found nothing?!?! Either we hang forever, or we panic. */ ++ if (!chosen) { ++ if (ub_oom_panic(ub)) { ++ read_unlock(&tasklist_lock); ++ panic("Out of memory and no killable processes...\n"); ++ } ++ ++ put_beancounter(ub); ++ goto retry; ++ } ++done: ++ put_beancounter(ub); + return chosen; + } + +@@ -182,7 +214,7 @@ static struct task_struct * select_bad_p + * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that + * we select a process with CAP_SYS_RAW_IO set). + */ +-static void __oom_kill_task(task_t *p) ++static void __oom_kill_task(task_t *p, struct mm_struct *mm) + { + if (p->pid == 1) { + WARN_ON(1); +@@ -208,7 +240,7 @@ static void __oom_kill_task(task_t *p) + */ + p->time_slice = HZ; + set_tsk_thread_flag(p, TIF_MEMDIE); +- ++ ub_oom_kill_task(p, mm); + force_sig(SIGKILL, p); + } + +@@ -224,15 +256,15 @@ static struct mm_struct *oom_kill_task(t + return NULL; + } + +- __oom_kill_task(p); ++ __oom_kill_task(p, mm); + /* + * kill all processes that share the ->mm (i.e. all threads), + * but are in a different thread group + */ +- do_each_thread(g, q) ++ do_each_thread_all(g, q) { + if (q->mm == mm && q->tgid != p->tgid) +- __oom_kill_task(q); +- while_each_thread(g, q); ++ __oom_kill_task(q, mm); ++ } while_each_thread_all(g, q); + + return mm; + } +@@ -268,6 +300,9 @@ void out_of_memory(gfp_t gfp_mask, int o + struct mm_struct *mm = NULL; + task_t * p; + ++ if (ub_oom_start()) ++ return; ++ + if (printk_ratelimit()) { + printk("oom-killer: gfp_mask=0x%x, order=%d\n", + gfp_mask, order); +@@ -277,28 +312,23 @@ void out_of_memory(gfp_t gfp_mask, int o + read_lock(&tasklist_lock); + retry: + p = select_bad_process(); +- + if (PTR_ERR(p) == -1UL) + goto out; + +- /* Found nothing?!?! Either we hang forever, or we panic. */ +- if (!p) { +- read_unlock(&tasklist_lock); +- panic("Out of memory and no killable processes...\n"); +- } +- + mm = oom_kill_process(p); + if (!mm) + goto retry; + + out: + read_unlock(&tasklist_lock); ++ ub_oom_stop(); + if (mm) + mmput(mm); + + /* + * Give "p" a good chance of killing itself before we +- * retry to allocate memory. ++ * retry to allocate memory or exit in case of suicide. 
+ */ +- schedule_timeout_interruptible(1); ++ if (!test_thread_flag(TIF_MEMDIE)) ++ schedule_timeout_interruptible(1); + } +diff -uprN linux-2.6.15.orig/mm/page_alloc.c linux-2.6.15-ve025stab014/mm/page_alloc.c +--- linux-2.6.15.orig/mm/page_alloc.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/page_alloc.c 2006-01-27 14:48:08.000000000 +0300 +@@ -40,6 +40,8 @@ + #include <asm/tlbflush.h> + #include "internal.h" + ++#include <ub/ub_mem.h> ++ + /* + * MCD - HACK: Find somewhere to initialize this EARLY, or make this + * initializer cleaner +@@ -49,6 +51,7 @@ EXPORT_SYMBOL(node_online_map); + nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; + EXPORT_SYMBOL(node_possible_map); + struct pglist_data *pgdat_list __read_mostly; ++EXPORT_SYMBOL(pgdat_list); + unsigned long totalram_pages __read_mostly; + unsigned long totalhigh_pages __read_mostly; + long nr_swap_pages; +@@ -415,6 +418,7 @@ void __free_pages_ok(struct page *page, + list_add(&page->lru, &list); + mod_page_state(pgfree, 1 << order); + kernel_map_pages(page, 1<<order, 0); ++ ub_page_uncharge(page, order); + free_pages_bulk(page_zone(page), 1, &list, order); + } + +@@ -699,6 +703,7 @@ static void fastcall free_hot_cold_page( + pcp->count++; + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); ++ ub_page_uncharge(page, 0); + local_irq_restore(flags); + put_cpu(); + } +@@ -854,6 +859,26 @@ get_page_from_freelist(gfp_t gfp_mask, u + return page; + } + ++static void __alloc_collect_stats(unsigned int gfp_mask, ++ unsigned int order, struct page *page, cycles_t time) ++{ ++ int ind; ++ unsigned long flags; ++ ++ time = get_cycles() - time; ++ if (!(gfp_mask & __GFP_WAIT)) ++ ind = 0; ++ else if (!(gfp_mask & __GFP_HIGHMEM)) ++ ind = (order > 0 ? 2 : 1); ++ else ++ ind = (order > 0 ? 4 : 3); ++ spin_lock_irqsave(&kstat_glb_lock, flags); ++ KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time); ++ if (!page) ++ kstat_glob.alloc_fails[ind]++; ++ spin_unlock_irqrestore(&kstat_glb_lock, flags); ++} ++ + /* + * This is the 'heart' of the zoned buddy allocator. + */ +@@ -869,6 +894,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned i + int do_retry; + int alloc_flags; + int did_some_progress; ++ cycles_t start; + + might_sleep_if(wait); + +@@ -880,6 +906,7 @@ restart: + return NULL; + } + ++ start = get_cycles(); + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (page) +@@ -997,6 +1024,7 @@ rebalance: + } + + nopage: ++ __alloc_collect_stats(gfp_mask, order, page, start); + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." 
+ " order:%d, mode:0x%x\n", +@@ -1004,7 +1032,13 @@ nopage: + dump_stack(); + show_mem(); + } ++ return NULL; ++ + got_pg: ++ if (ub_page_charge(page, order, gfp_mask)) { ++ __free_pages(page, order); ++ page = NULL; ++ } + return page; + } + +@@ -2304,7 +2338,10 @@ static void *vmstat_start(struct seq_fil + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); +- get_full_page_state(ps); ++ if (ve_is_super(get_exec_env())) ++ get_full_page_state(ps); ++ else ++ memset(ps, 0, sizeof(*ps)); + ps->pgpgin /= 2; /* sectors -> kbytes */ + ps->pgpgout /= 2; + return (unsigned long *)ps + *pos; +diff -uprN linux-2.6.15.orig/mm/rmap.c linux-2.6.15-ve025stab014/mm/rmap.c +--- linux-2.6.15.orig/mm/rmap.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/rmap.c 2006-01-27 14:48:06.000000000 +0300 +@@ -55,6 +55,8 @@ + + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++ + //#define RMAP_DEBUG /* can be enabled only for debugging */ + + kmem_cache_t *anon_vma_cachep; +@@ -179,7 +181,8 @@ static void anon_vma_ctor(void *data, km + void __init anon_vma_init(void) + { + anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), +- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); ++ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, ++ anon_vma_ctor, NULL); + } + + /* +@@ -562,7 +565,11 @@ static int try_to_unmap_one(struct page + } else + dec_mm_counter(mm, file_rss); + ++ dec_vma_rss(vma); + page_remove_rmap(page); ++ ub_unused_privvm_inc(mm, vma); ++ ub_unmap_inc(mm); ++ pb_remove_ref(page, mm); + page_cache_release(page); + + out_unmap: +@@ -653,8 +660,12 @@ static void try_to_unmap_cluster(unsigne + set_page_dirty(page); + + page_remove_rmap(page); ++ ub_unmap_inc(mm); ++ pb_remove_ref(page, mm); ++ ub_unused_privvm_inc(mm, vma); + page_cache_release(page); + dec_mm_counter(mm, file_rss); ++ dec_vma_rss(vma); + (*mapcount)--; + } + pte_unmap_unlock(pte - 1, ptl); +diff -uprN linux-2.6.15.orig/mm/shmem.c linux-2.6.15-ve025stab014/mm/shmem.c +--- linux-2.6.15.orig/mm/shmem.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/shmem.c 2006-01-27 14:48:08.000000000 +0300 +@@ -49,6 +49,8 @@ + #include <asm/div64.h> + #include <asm/pgtable.h> + ++#include <ub/ub_vmpages.h> ++ + /* This magic number is used in glibc for posix shared memory */ + #define TMPFS_MAGIC 0x01021994 + +@@ -210,7 +212,7 @@ static void shmem_free_blocks(struct ino + * + * It has to be called with the spinlock held. 
+ */ +-static void shmem_recalc_inode(struct inode *inode) ++static void shmem_recalc_inode(struct inode *inode, long swp_freed) + { + struct shmem_inode_info *info = SHMEM_I(inode); + long freed; +@@ -220,6 +222,8 @@ static void shmem_recalc_inode(struct in + info->alloced -= freed; + shmem_unacct_blocks(info->flags, freed); + shmem_free_blocks(inode, freed); ++ if (freed > swp_freed) ++ ub_tmpfs_respages_sub(info, freed - swp_freed); + } + } + +@@ -325,6 +329,11 @@ static void shmem_swp_set(struct shmem_i + struct page *page = kmap_atomic_to_page(entry); + set_page_private(page, page_private(page) + incdec); + } ++ ++ if (incdec == 1) ++ ub_tmpfs_respages_dec(info); ++ else ++ ub_tmpfs_respages_inc(info); + } + + /* +@@ -346,6 +355,9 @@ static swp_entry_t *shmem_swp_alloc(stru + ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + return ERR_PTR(-EINVAL); + ++ if (ub_shmpages_charge(info, index - info->next_index + 1)) ++ return ERR_PTR(-ENOSPC); ++ + while (!(entry = shmem_swp_entry(info, index, &page))) { + if (sgp == SGP_READ) + return shmem_swp_map(ZERO_PAGE(0)); +@@ -366,7 +378,8 @@ static swp_entry_t *shmem_swp_alloc(stru + } + + spin_unlock(&info->lock); +- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); ++ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | ++ __GFP_ZERO | __GFP_UBC); + if (page) + set_page_private(page, 0); + spin_lock(&info->lock); +@@ -482,6 +495,7 @@ static void shmem_truncate(struct inode + return; + + spin_lock(&info->lock); ++ ub_shmpages_uncharge(info, info->next_index - idx); + info->flags |= SHMEM_TRUNCATE; + limit = info->next_index; + info->next_index = idx; +@@ -602,7 +616,7 @@ done2: + info->swapped -= nr_swaps_freed; + if (nr_pages_to_free) + shmem_free_blocks(inode, nr_pages_to_free); +- shmem_recalc_inode(inode); ++ shmem_recalc_inode(inode, nr_swaps_freed); + spin_unlock(&info->lock); + + /* +@@ -680,6 +694,7 @@ static void shmem_delete_inode(struct in + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } ++ shmi_ub_put(info); + clear_inode(inode); + } + +@@ -801,6 +816,12 @@ int shmem_unuse(swp_entry_t entry, struc + return found; + } + ++#ifdef CONFIG_USER_RESOURCE ++#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub)) ++#else ++#define shm_get_swap_page(info) (get_swap_page(NULL)) ++#endif ++ + /* + * Move the page from the page cache to the swap cache. 
+ */ +@@ -821,12 +842,12 @@ static int shmem_writepage(struct page * + info = SHMEM_I(inode); + if (info->flags & VM_LOCKED) + goto redirty; +- swap = get_swap_page(); ++ swap = shm_get_swap_page(info); + if (!swap.val) + goto redirty; + + spin_lock(&info->lock); +- shmem_recalc_inode(inode); ++ shmem_recalc_inode(inode, 0); + if (index >= info->next_index) { + BUG_ON(!(info->flags & SHMEM_TRUNCATE)); + goto unlock; +@@ -964,7 +985,7 @@ repeat: + goto failed; + + spin_lock(&info->lock); +- shmem_recalc_inode(inode); ++ shmem_recalc_inode(inode, 0); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) { + spin_unlock(&info->lock); +@@ -1132,6 +1153,7 @@ repeat: + spin_unlock(&info->lock); + flush_dcache_page(filepage); + SetPageUptodate(filepage); ++ ub_tmpfs_respages_inc(info); + } + done: + if (*pagep != filepage) { +@@ -1233,28 +1255,6 @@ shmem_get_policy(struct vm_area_struct * + } + #endif + +-int shmem_lock(struct file *file, int lock, struct user_struct *user) +-{ +- struct inode *inode = file->f_dentry->d_inode; +- struct shmem_inode_info *info = SHMEM_I(inode); +- int retval = -ENOMEM; +- +- spin_lock(&info->lock); +- if (lock && !(info->flags & VM_LOCKED)) { +- if (!user_shm_lock(inode->i_size, user)) +- goto out_nomem; +- info->flags |= VM_LOCKED; +- } +- if (!lock && (info->flags & VM_LOCKED) && user) { +- user_shm_unlock(inode->i_size, user); +- info->flags &= ~VM_LOCKED; +- } +- retval = 0; +-out_nomem: +- spin_unlock(&info->lock); +- return retval; +-} +- + static int shmem_mmap(struct file *file, struct vm_area_struct *vma) + { + file_accessed(file); +@@ -1291,6 +1291,7 @@ shmem_get_inode(struct super_block *sb, + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); ++ shmi_ub_set(info, get_exec_ub()); + spin_lock_init(&info->lock); + INIT_LIST_HEAD(&info->swaplist); + +@@ -2120,6 +2121,10 @@ static struct vm_operations_struct shmem + #endif + }; + ++int is_shmem_mapping(struct address_space *map) ++{ ++ return (map != NULL && map->a_ops == &shmem_aops); ++} + + static struct super_block *shmem_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +@@ -2133,7 +2138,13 @@ static struct file_system_type tmpfs_fs_ + .get_sb = shmem_get_sb, + .kill_sb = kill_litter_super, + }; ++EXPORT_SYMBOL(tmpfs_fs_type); ++ ++#ifdef CONFIG_VE ++#define shm_mnt (get_exec_env()->shmem_mnt) ++#else + static struct vfsmount *shm_mnt; ++#endif + + static int __init init_tmpfs(void) + { +@@ -2170,6 +2181,36 @@ out3: + } + module_init(init_tmpfs) + ++static inline int shm_charge_ahead(struct inode *inode) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ struct shmem_inode_info *info = SHMEM_I(inode); ++ unsigned long idx; ++ swp_entry_t *entry; ++ ++ if (!inode->i_size) ++ return 0; ++ idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; ++ /* ++ * Just touch info to allocate space for entry and ++ * make all UBC checks ++ */ ++ spin_lock(&info->lock); ++ entry = shmem_swp_alloc(info, idx, SGP_CACHE); ++ if (IS_ERR(entry)) ++ goto err; ++ shmem_swp_unmap(entry); ++ spin_unlock(&info->lock); ++ return 0; ++ ++err: ++ spin_unlock(&info->lock); ++ return PTR_ERR(entry); ++#else ++ return 0; ++#endif ++} ++ + /* + * shmem_file_setup - get an unlinked file living in tmpfs + * +@@ -2217,6 +2258,10 @@ struct file *shmem_file_setup(char *name + d_instantiate(dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; /* It is unlinked */ ++ error = shm_charge_ahead(inode); ++ if (error) ++ 
goto close_file; ++ + file->f_vfsmnt = mntget(shm_mnt); + file->f_dentry = dentry; + file->f_mapping = inode->i_mapping; +@@ -2249,6 +2294,8 @@ int shmem_zero_setup(struct vm_area_stru + + if (vma->vm_file) + fput(vma->vm_file); ++ else if (vma->vm_flags & VM_WRITE) ++ __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT); + vma->vm_file = file; + vma->vm_ops = &shmem_vm_ops; + return 0; +diff -uprN linux-2.6.15.orig/mm/slab.c linux-2.6.15-ve025stab014/mm/slab.c +--- linux-2.6.15.orig/mm/slab.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/slab.c 2006-01-27 14:48:08.000000000 +0300 +@@ -103,33 +103,19 @@ + #include <linux/rcupdate.h> + #include <linux/string.h> + #include <linux/nodemask.h> ++#include <linux/kmem_slab.h> ++#include <linux/kmem_cache.h> + + #include <asm/uaccess.h> + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + #include <asm/page.h> + +-/* +- * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, +- * SLAB_RED_ZONE & SLAB_POISON. +- * 0 for faster, smaller code (especially in the critical paths). +- * +- * STATS - 1 to collect stats for /proc/slabinfo. +- * 0 for faster, smaller code (especially in the critical paths). +- * +- * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) +- */ +- +-#ifdef CONFIG_DEBUG_SLAB +-#define DEBUG 1 +-#define STATS 1 +-#define FORCED_DEBUG 1 +-#else +-#define DEBUG 0 +-#define STATS 0 +-#define FORCED_DEBUG 0 +-#endif ++#include <ub/ub_mem.h> + ++#define DEBUG SLAB_DEBUG ++#define STATS SLAB_STATS ++#define FORCED_DEBUG SLAB_FORCED_DEBUG + + /* Shouldn't this be in a header file somewhere? */ + #define BYTES_PER_WORD sizeof(void *) +@@ -172,140 +158,27 @@ + SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ +- SLAB_DESTROY_BY_RCU) ++ SLAB_DESTROY_BY_RCU | SLAB_UBC | SLAB_NO_CHARGE) + #else + # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ + SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ +- SLAB_DESTROY_BY_RCU) ++ SLAB_DESTROY_BY_RCU | SLAB_UBC | SLAB_NO_CHARGE) + #endif + +-/* +- * kmem_bufctl_t: +- * +- * Bufctl's are used for linking objs within a slab +- * linked offsets. +- * +- * This implementation relies on "struct page" for locating the cache & +- * slab an object belongs to. +- * This allows the bufctl structure to be small (one int), but limits +- * the number of objects a slab (not a cache) can contain when off-slab +- * bufctls are used. The limit is the size of the largest general cache +- * that does not use off-slab slabs. +- * For 32bit archs with 4 kB pages, is this 56. +- * This is not serious, as it is only for large objects, when it is unwise +- * to have too many per slab. +- * Note: This limit can be raised by introducing a general cache whose size +- * is less than 512 (PAGE_SIZE<<3), but greater than 256. +- */ +- +-typedef unsigned int kmem_bufctl_t; +-#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) +-#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) +-#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) +- + /* Max number of objs-per-slab for caches which use off-slab slabs. + * Needed to avoid a possible looping condition in cache_grow(). + */ + static unsigned long offslab_limit; + + /* +- * struct slab +- * +- * Manages the objs in a slab. Placed either at the beginning of mem allocated +- * for a slab, or allocated from an general cache. +- * Slabs are chained into three list: fully used, partial, fully free slabs. 
+- */ +-struct slab { +- struct list_head list; +- unsigned long colouroff; +- void *s_mem; /* including colour offset */ +- unsigned int inuse; /* num of objs active in slab */ +- kmem_bufctl_t free; +- unsigned short nodeid; +-}; +- +-/* +- * struct slab_rcu +- * +- * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to +- * arrange for kmem_freepages to be called via RCU. This is useful if +- * we need to approach a kernel structure obliquely, from its address +- * obtained without the usual locking. We can lock the structure to +- * stabilize it and check it's still at the given address, only if we +- * can be sure that the memory has not been meanwhile reused for some +- * other kind of object (which our subsystem's lock might corrupt). +- * +- * rcu_read_lock before reading the address, then rcu_read_unlock after +- * taking the spinlock within the structure expected at that address. +- * +- * We assume struct slab_rcu can overlay struct slab when destroying. +- */ +-struct slab_rcu { +- struct rcu_head head; +- kmem_cache_t *cachep; +- void *addr; +-}; +- +-/* +- * struct array_cache +- * +- * Purpose: +- * - LIFO ordering, to hand out cache-warm objects from _alloc +- * - reduce the number of linked list operations +- * - reduce spinlock operations +- * +- * The limit is stored in the per-cpu structure to reduce the data cache +- * footprint. +- * +- */ +-struct array_cache { +- unsigned int avail; +- unsigned int limit; +- unsigned int batchcount; +- unsigned int touched; +- spinlock_t lock; +- void *entry[0]; /* +- * Must have this definition in here for the proper +- * alignment of array_cache. Also simplifies accessing +- * the entries. +- * [0] is for gcc 2.95. It should really be []. +- */ +-}; +- +-/* bootstrap: The caches do not work without cpuarrays anymore, +- * but the cpuarrays are allocated from the generic caches... +- */ +-#define BOOT_CPUCACHE_ENTRIES 1 +-struct arraycache_init { +- struct array_cache cache; +- void * entries[BOOT_CPUCACHE_ENTRIES]; +-}; +- +-/* +- * The slab lists for all objects. +- */ +-struct kmem_list3 { +- struct list_head slabs_partial; /* partial list first, better asm code */ +- struct list_head slabs_full; +- struct list_head slabs_free; +- unsigned long free_objects; +- unsigned long next_reap; +- int free_touched; +- unsigned int free_limit; +- spinlock_t list_lock; +- struct array_cache *shared; /* shared per node */ +- struct array_cache **alien; /* on other nodes */ +-}; +- +-/* + * Need this for bootstrapping a per node allocator. + */ + #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) + struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; + #define CACHE_CACHE 0 +-#define SIZE_AC 1 +-#define SIZE_L3 (1 + MAX_NUMNODES) ++#define SIZE_AC 1 ++#define SIZE_L3 (1 + MAX_NUMNODES) + + /* + * This function must be completely optimized away if +@@ -362,74 +235,6 @@ static inline void kmem_list3_init(struc + MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ + } while (0) + +-/* +- * kmem_cache_t +- * +- * manages a cache. 
+- */ +- +-struct kmem_cache { +-/* 1) per-cpu data, touched during every alloc/free */ +- struct array_cache *array[NR_CPUS]; +- unsigned int batchcount; +- unsigned int limit; +- unsigned int shared; +- unsigned int objsize; +-/* 2) touched by every alloc & free from the backend */ +- struct kmem_list3 *nodelists[MAX_NUMNODES]; +- unsigned int flags; /* constant flags */ +- unsigned int num; /* # of objs per slab */ +- spinlock_t spinlock; +- +-/* 3) cache_grow/shrink */ +- /* order of pgs per slab (2^n) */ +- unsigned int gfporder; +- +- /* force GFP flags, e.g. GFP_DMA */ +- gfp_t gfpflags; +- +- size_t colour; /* cache colouring range */ +- unsigned int colour_off; /* colour offset */ +- unsigned int colour_next; /* cache colouring */ +- kmem_cache_t *slabp_cache; +- unsigned int slab_size; +- unsigned int dflags; /* dynamic flags */ +- +- /* constructor func */ +- void (*ctor)(void *, kmem_cache_t *, unsigned long); +- +- /* de-constructor func */ +- void (*dtor)(void *, kmem_cache_t *, unsigned long); +- +-/* 4) cache creation/removal */ +- const char *name; +- struct list_head next; +- +-/* 5) statistics */ +-#if STATS +- unsigned long num_active; +- unsigned long num_allocations; +- unsigned long high_mark; +- unsigned long grown; +- unsigned long reaped; +- unsigned long errors; +- unsigned long max_freeable; +- unsigned long node_allocs; +- unsigned long node_frees; +- atomic_t allochit; +- atomic_t allocmiss; +- atomic_t freehit; +- atomic_t freemiss; +-#endif +-#if DEBUG +- int dbghead; +- int reallen; +-#endif +-}; +- +-#define CFLGS_OFF_SLAB (0x80000000UL) +-#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) +- + #define BATCHREFILL_LIMIT 16 + /* Optimization question: fewer reaps means less + * probability for unnessary cpucache drain/refill cycles. +@@ -565,30 +370,6 @@ static void **dbg_userword(kmem_cache_t + #define BREAK_GFP_ORDER_LO 0 + static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; + +-/* Functions for storing/retrieving the cachep and or slab from the +- * global 'mem_map'. These are used to find the slab an obj belongs to. +- * With kfree(), these are used to find the cache which an obj belongs to. +- */ +-static inline void page_set_cache(struct page *page, struct kmem_cache *cache) +-{ +- page->lru.next = (struct list_head *)cache; +-} +- +-static inline struct kmem_cache *page_get_cache(struct page *page) +-{ +- return (struct kmem_cache *)page->lru.next; +-} +- +-static inline void page_set_slab(struct page *page, struct slab *slab) +-{ +- page->lru.prev = (struct list_head *)slab; +-} +- +-static inline struct slab *page_get_slab(struct page *page) +-{ +- return (struct slab *)page->lru.prev; +-} +- + /* These are the default caches for kmalloc. Custom caches can have other sizes. 
*/ + struct cache_sizes malloc_sizes[] = { + #define CACHE(x) { .cs_size = (x) }, +@@ -701,15 +482,25 @@ static void cache_estimate(unsigned long + { + int i; + size_t wastage = PAGE_SIZE<<gfporder; +- size_t extra = 0; +- size_t base = 0; ++ size_t extra; ++ size_t base; ++ size_t ub_align, ub_extra; + + if (!(flags & CFLGS_OFF_SLAB)) { + base = sizeof(struct slab); + extra = sizeof(kmem_bufctl_t); ++ ub_align = UB_ALIGN(flags); ++ ub_extra = UB_EXTRA(flags); ++ } else { ++ base = 0; ++ extra = 0; ++ ub_align = UB_ALIGN(0); ++ ub_extra = UB_EXTRA(0); + } ++ + i = 0; +- while (i*size + ALIGN(base+i*extra, align) <= wastage) ++ while (i * size + ALIGN(ALIGN(base + i * extra, ub_align) + ++ i * ub_extra, align) <= wastage) + i++; + if (i > 0) + i--; +@@ -718,8 +509,8 @@ static void cache_estimate(unsigned long + i = SLAB_LIMIT; + + *num = i; +- wastage -= i*size; +- wastage -= ALIGN(base+i*extra, align); ++ wastage -= i * size + ALIGN(ALIGN(base + i * extra, ub_align) ++ + i * ub_extra, align); + *left_over = wastage; + } + +@@ -1075,13 +866,15 @@ void __init kmem_cache_init(void) + + sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, + sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, +- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); ++ (ARCH_KMALLOC_FLAGS | SLAB_PANIC | ++ SLAB_UBC | SLAB_NO_CHARGE), NULL, NULL); + + if (INDEX_AC != INDEX_L3) + sizes[INDEX_L3].cs_cachep = + kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, +- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); ++ (ARCH_KMALLOC_FLAGS | SLAB_PANIC | ++ SLAB_UBC | SLAB_NO_CHARGE), NULL, NULL); + + while (sizes->cs_size != ULONG_MAX) { + /* +@@ -1094,13 +887,12 @@ void __init kmem_cache_init(void) + if(!sizes->cs_cachep) + sizes->cs_cachep = kmem_cache_create(names->name, + sizes->cs_size, ARCH_KMALLOC_MINALIGN, +- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); ++ (ARCH_KMALLOC_FLAGS | SLAB_PANIC | ++ SLAB_UBC | SLAB_NO_CHARGE), NULL, NULL); + + /* Inc off-slab bufctl limit until the ceiling is hit. 
*/ +- if (!(OFF_SLAB(sizes->cs_cachep))) { +- offslab_limit = sizes->cs_size-sizeof(struct slab); +- offslab_limit /= sizeof(kmem_bufctl_t); +- } ++ if (!(OFF_SLAB(sizes->cs_cachep))) ++ offslab_limit = sizes->cs_size; + + sizes->cs_dmacachep = kmem_cache_create(names->name_dma, + sizes->cs_size, ARCH_KMALLOC_MINALIGN, +@@ -1511,7 +1303,7 @@ kmem_cache_create (const char *name, siz + unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), + void (*dtor)(void*, kmem_cache_t *, unsigned long)) + { +- size_t left_over, slab_size, ralign; ++ size_t left_over, slab_size, ralign, ub_align, ub_extra; + kmem_cache_t *cachep = NULL; + struct list_head *p; + +@@ -1672,6 +1464,8 @@ kmem_cache_create (const char *name, siz + flags |= CFLGS_OFF_SLAB; + + size = ALIGN(size, align); ++ ub_align = UB_ALIGN(flags); ++ ub_extra = UB_EXTRA(flags); + + if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) { + /* +@@ -1692,6 +1486,8 @@ kmem_cache_create (const char *name, siz + */ + do { + unsigned int break_flag = 0; ++ unsigned long off_slab_size; ++ + cal_wastage: + cache_estimate(cachep->gfporder, size, align, flags, + &left_over, &cachep->num); +@@ -1701,12 +1497,17 @@ cal_wastage: + break; + if (!cachep->num) + goto next; +- if (flags & CFLGS_OFF_SLAB && +- cachep->num > offslab_limit) { ++ if (flags & CFLGS_OFF_SLAB) { ++ off_slab_size = sizeof(struct slab) + ++ cachep->num * sizeof(kmem_bufctl_t); ++ off_slab_size = ALIGN(off_slab_size, ub_align) + ++ cachep->num * ub_extra; + /* This num of objs will cause problems. */ +- cachep->gfporder--; +- break_flag++; +- goto cal_wastage; ++ if (off_slab_size > offslab_limit) { ++ cachep->gfporder--; ++ break_flag++; ++ goto cal_wastage; ++ } + } + + /* +@@ -1729,8 +1530,9 @@ next: + cachep = NULL; + goto oops; + } +- slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) +- + sizeof(struct slab), align); ++ slab_size = ALIGN(ALIGN(cachep->num * sizeof(kmem_bufctl_t) + ++ sizeof(struct slab), ub_align) + ++ cachep->num * ub_extra, align); + + /* + * If the slab has been placed off-slab, and we have enough space then +@@ -1743,7 +1545,9 @@ next: + + if (flags & CFLGS_OFF_SLAB) { + /* really off slab. No need for manual alignment */ +- slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); ++ slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + ++ sizeof(struct slab), ub_align) + ++ cachep->num * ub_extra; + } + + cachep->colour_off = cache_line_size(); +@@ -1825,6 +1629,7 @@ next: + /* cache setup completed, link it into the list */ + list_add(&cachep->next, &cache_chain); + unlock_cpu_hotplug(); ++ set_cache_objuse(cachep); + oops: + if (!cachep && (flags & SLAB_PANIC)) + panic("kmem_cache_create(): failed to create slab `%s'\n", +@@ -2064,7 +1869,8 @@ static struct slab* alloc_slabmgmt(kmem_ + + if (OFF_SLAB(cachep)) { + /* Slab management obj is off-slab. 
*/ +- slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); ++ slabp = kmem_cache_alloc(cachep->slabp_cache, ++ local_flags & (~__GFP_UBC)); + if (!slabp) + return NULL; + } else { +@@ -2074,15 +1880,11 @@ static struct slab* alloc_slabmgmt(kmem_ + slabp->inuse = 0; + slabp->colouroff = colour_off; + slabp->s_mem = objp+colour_off; ++ init_slab_ubps(cachep, slabp); + + return slabp; + } + +-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +-{ +- return (kmem_bufctl_t *)(slabp+1); +-} +- + static void cache_init_objs(kmem_cache_t *cachep, + struct slab *slabp, unsigned long ctor_flags) + { +@@ -2213,7 +2015,7 @@ static int cache_grow(kmem_cache_t *cach + /* Get mem for the objs. + * Attempt to allocate a physical page from 'nodeid', + */ +- if (!(objp = kmem_getpages(cachep, flags, nodeid))) ++ if (!(objp = kmem_getpages(cachep, flags & (~__GFP_UBC), nodeid))) + goto failed; + + /* Get slab management. */ +@@ -2552,6 +2354,11 @@ static inline void *__cache_alloc(kmem_c + objp = cache_alloc_debugcheck_after(cachep, flags, objp, + __builtin_return_address(0)); + prefetchw(objp); ++ ++ if (objp && ub_slab_charge(objp, flags)) { ++ kmem_cache_free(cachep, objp); ++ objp = NULL; ++ } + return objp; + } + +@@ -2749,6 +2556,8 @@ static inline void __cache_free(kmem_cac + check_irq_off(); + objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + ++ ub_slab_uncharge(objp); ++ + /* Make sure we are not freeing a object from another + * node to the array cache on this cpu. + */ +@@ -2884,6 +2693,10 @@ void *kmem_cache_alloc_node(kmem_cache_t + local_irq_restore(save_flags); + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); + ++ if (ptr && ub_slab_charge(ptr, flags)) { ++ kmem_cache_free(cachep, ptr); ++ ptr = NULL; ++ } + return ptr; + } + EXPORT_SYMBOL(kmem_cache_alloc_node); +@@ -3295,6 +3108,7 @@ static void cache_reap(void *unused) + return; + } + ++ {KSTAT_PERF_ENTER(cache_reap) + list_for_each(walk, &cache_chain) { + kmem_cache_t *searchp; + struct list_head* p; +@@ -3359,6 +3173,7 @@ next: + check_irq_on(); + up(&cache_chain_sem); + drain_remote_pages(); ++ KSTAT_PERF_LEAVE(cache_reap)} + /* Setup the next iteration */ + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); + } +diff -uprN linux-2.6.15.orig/mm/swap.c linux-2.6.15-ve025stab014/mm/swap.c +--- linux-2.6.15.orig/mm/swap.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/swap.c 2006-01-27 14:48:05.000000000 +0300 +@@ -349,7 +349,9 @@ void pagevec_strip(struct pagevec *pvec) + struct page *page = pvec->pages[i]; + + if (PagePrivate(page) && !TestSetPageLocked(page)) { +- try_to_release_page(page, 0); ++ /* need to recheck after lock */ ++ if (page_has_buffers(page)) ++ try_to_release_page(page, 0); + unlock_page(page); + } + } +diff -uprN linux-2.6.15.orig/mm/swap_state.c linux-2.6.15-ve025stab014/mm/swap_state.c +--- linux-2.6.15.orig/mm/swap_state.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/swap_state.c 2006-01-27 14:48:08.000000000 +0300 +@@ -17,6 +17,8 @@ + + #include <asm/pgtable.h> + ++#include <ub/ub_vmpages.h> ++ + /* + * swapper_space is a fiction, retained to simplify the path through + * vmscan's shrink_list, to make sync_page look nicer, and to allow +@@ -51,6 +53,7 @@ static struct { + unsigned long noent_race; + unsigned long exist_race; + } swap_cache_info; ++EXPORT_SYMBOL(swap_cache_info); + + void show_swap_cache_info(void) + { +@@ -149,7 +152,14 @@ int add_to_swap(struct page * 
page) + BUG(); + + for (;;) { +- entry = get_swap_page(); ++ struct user_beancounter *ub; ++ ++ ub = pb_grab_page_ub(page); ++ if (IS_ERR(ub)) ++ return 0; ++ ++ entry = get_swap_page(ub); ++ put_beancounter(ub); + if (!entry.val) + return 0; + +diff -uprN linux-2.6.15.orig/mm/swapfile.c linux-2.6.15-ve025stab014/mm/swapfile.c +--- linux-2.6.15.orig/mm/swapfile.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/swapfile.c 2006-01-27 14:48:08.000000000 +0300 +@@ -31,6 +31,8 @@ + #include <asm/tlbflush.h> + #include <linux/swapops.h> + ++#include <ub/ub_vmpages.h> ++ + DEFINE_SPINLOCK(swap_lock); + unsigned int nr_swapfiles; + long total_swap_pages; +@@ -170,7 +172,7 @@ no_page: + return 0; + } + +-swp_entry_t get_swap_page(void) ++swp_entry_t get_swap_page(struct user_beancounter *ub) + { + struct swap_info_struct *si; + pgoff_t offset; +@@ -200,6 +202,7 @@ swp_entry_t get_swap_page(void) + offset = scan_swap_map(si); + if (offset) { + spin_unlock(&swap_lock); ++ ub_swapentry_inc(si, offset, ub); + return swp_entry(type, offset); + } + next = swap_list.next; +@@ -255,6 +258,7 @@ static int swap_entry_free(struct swap_i + count--; + p->swap_map[offset] = count; + if (!count) { ++ ub_swapentry_dec(p, offset); + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) +@@ -401,11 +405,18 @@ void free_swap_and_cache(swp_entry_t ent + * force COW, vm_page_prot omits write permission from any private vma. + */ + static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, +- unsigned long addr, swp_entry_t entry, struct page *page) ++ unsigned long addr, swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { +- inc_mm_counter(vma->vm_mm, anon_rss); ++ struct mm_struct *mm; ++ ++ mm = vma->vm_mm; ++ inc_mm_counter(mm, anon_rss); ++ inc_vma_rss(vma); ++ ub_unused_privvm_dec(mm, vma); ++ pb_add_list_ref(page, mm, pb); + get_page(page); +- set_pte_at(vma->vm_mm, addr, pte, ++ set_pte_at(mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_anon_rmap(page, vma, addr); + swap_free(entry); +@@ -418,7 +429,8 @@ static void unuse_pte(struct vm_area_str + + static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pte_t swp_pte = swp_entry_to_pte(entry); + pte_t *pte; +@@ -432,7 +444,7 @@ static int unuse_pte_range(struct vm_are + * Test inline before going to call unuse_pte. 
+ */ + if (unlikely(pte_same(*pte, swp_pte))) { +- unuse_pte(vma, pte++, addr, entry, page); ++ unuse_pte(vma, pte++, addr, entry, page, pb); + found = 1; + break; + } +@@ -443,7 +455,8 @@ static int unuse_pte_range(struct vm_are + + static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pmd_t *pmd; + unsigned long next; +@@ -453,7 +466,7 @@ static inline int unuse_pmd_range(struct + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; +- if (unuse_pte_range(vma, pmd, addr, next, entry, page)) ++ if (unuse_pte_range(vma, pmd, addr, next, entry, page, pb)) + return 1; + } while (pmd++, addr = next, addr != end); + return 0; +@@ -461,7 +474,8 @@ static inline int unuse_pmd_range(struct + + static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pud_t *pud; + unsigned long next; +@@ -471,14 +485,15 @@ static inline int unuse_pud_range(struct + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; +- if (unuse_pmd_range(vma, pud, addr, next, entry, page)) ++ if (unuse_pmd_range(vma, pud, addr, next, entry, page, pb)) + return 1; + } while (pud++, addr = next, addr != end); + return 0; + } + + static int unuse_vma(struct vm_area_struct *vma, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pgd_t *pgd; + unsigned long addr, end, next; +@@ -499,14 +514,15 @@ static int unuse_vma(struct vm_area_stru + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; +- if (unuse_pud_range(vma, pgd, addr, next, entry, page)) ++ if (unuse_pud_range(vma, pgd, addr, next, entry, page, pb)) + return 1; + } while (pgd++, addr = next, addr != end); + return 0; + } + + static int unuse_mm(struct mm_struct *mm, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + struct vm_area_struct *vma; + +@@ -521,7 +537,7 @@ static int unuse_mm(struct mm_struct *mm + lock_page(page); + } + for (vma = mm->mmap; vma; vma = vma->vm_next) { +- if (vma->anon_vma && unuse_vma(vma, entry, page)) ++ if (vma->anon_vma && unuse_vma(vma, entry, page, pb)) + break; + } + up_read(&mm->mmap_sem); +@@ -587,6 +603,7 @@ static int try_to_unuse(unsigned int typ + int retval = 0; + int reset_overflow = 0; + int shmem; ++ struct page_beancounter *pb; + + /* + * When searching mms for an entry, a good strategy is to +@@ -638,6 +655,13 @@ static int try_to_unuse(unsigned int typ + break; + } + ++ pb = NULL; ++ if (pb_alloc_all(&pb)) { ++ page_cache_release(page); ++ retval = -ENOMEM; ++ break; ++ } ++ + /* + * Don't hold on to start_mm if it looks like exiting. 
+ */ +@@ -671,7 +695,7 @@ static int try_to_unuse(unsigned int typ + if (start_mm == &init_mm) + shmem = shmem_unuse(entry, page); + else +- retval = unuse_mm(start_mm, entry, page); ++ retval = unuse_mm(start_mm, entry, page, &pb); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); +@@ -703,7 +727,7 @@ static int try_to_unuse(unsigned int typ + set_start_mm = 1; + shmem = shmem_unuse(entry, page); + } else +- retval = unuse_mm(mm, entry, page); ++ retval = unuse_mm(mm, entry, page, &pb); + if (set_start_mm && *swap_map < swcount) { + mmput(new_start_mm); + atomic_inc(&mm->mm_users); +@@ -717,6 +741,8 @@ static int try_to_unuse(unsigned int typ + mmput(start_mm); + start_mm = new_start_mm; + } ++ ++ pb_free_list(&pb); + if (retval) { + unlock_page(page); + page_cache_release(page); +@@ -1062,6 +1088,10 @@ asmlinkage long sys_swapoff(const char _ + int i, type, prev; + int err; + ++ /* VE admin check is just to be on the safe side, the admin may affect ++ * swaps only if he has access to special, i.e. if he has been granted ++ * access to the block device or if the swap file is in the area ++ * visible to him. */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + +@@ -1161,6 +1191,7 @@ asmlinkage long sys_swapoff(const char _ + spin_unlock(&swap_lock); + up(&swapon_sem); + vfree(swap_map); ++ ub_swap_fini(p); + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { + struct block_device *bdev = I_BDEV(inode); +@@ -1519,6 +1550,11 @@ asmlinkage long sys_swapon(const char __ + goto bad_swap; + } + ++ if (ub_swap_init(p, maxpages)) { ++ error = -ENOMEM; ++ goto bad_swap; ++ } ++ + down(&swapon_sem); + spin_lock(&swap_lock); + p->flags = SWP_ACTIVE; +diff -uprN linux-2.6.15.orig/mm/vmalloc.c linux-2.6.15-ve025stab014/mm/vmalloc.c +--- linux-2.6.15.orig/mm/vmalloc.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/vmalloc.c 2006-01-27 14:48:08.000000000 +0300 +@@ -20,6 +20,8 @@ + #include <asm/uaccess.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_debug.h> ++ + + DEFINE_RWLOCK(vmlist_lock); + struct vm_struct *vmlist; +@@ -256,6 +258,66 @@ struct vm_struct *get_vm_area_node(unsig + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); + } + ++struct vm_struct * get_vm_area_best(unsigned long size, unsigned long flags) ++{ ++ unsigned long addr, best_addr, delta, best_delta; ++ struct vm_struct **p, **best_p, *tmp, *area; ++ ++ area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); ++ if (!area) ++ return NULL; ++ ++ size += PAGE_SIZE; /* one-page gap at the end */ ++ addr = VMALLOC_START; ++ best_addr = 0UL; ++ best_p = NULL; ++ best_delta = PAGE_ALIGN(VMALLOC_END) - VMALLOC_START; ++ ++ write_lock(&vmlist_lock); ++ for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { ++ if ((size + addr) < addr) ++ break; ++ delta = (unsigned long) tmp->addr - (size + addr); ++ if (delta < best_delta) { ++ best_delta = delta; ++ best_addr = addr; ++ best_p = p; ++ } ++ addr = tmp->size + (unsigned long) tmp->addr; ++ if (addr > VMALLOC_END-size) ++ break; ++ } ++ ++ if (!tmp) { ++ /* check free area after list end */ ++ delta = (unsigned long) PAGE_ALIGN(VMALLOC_END) - (size + addr); ++ if (delta < best_delta) { ++ best_delta = delta; ++ best_addr = addr; ++ best_p = p; ++ } ++ } ++ if (best_addr) { ++ area->flags = flags; ++ /* allocate at the end of this area */ ++ area->addr = (void *)(best_addr + best_delta); ++ area->size = size; ++ area->next = *best_p; ++ area->pages = NULL; ++ area->nr_pages = 0; ++ area->phys_addr = 0; ++ *best_p = 
area; ++ /* check like in __vunmap */ ++ WARN_ON((PAGE_SIZE - 1) & (unsigned long)area->addr); ++ } else { ++ kfree(area); ++ area = NULL; ++ } ++ write_unlock(&vmlist_lock); ++ ++ return area; ++} ++ + /* Caller must hold vmlist_lock */ + struct vm_struct *__remove_vm_area(void *addr) + { +@@ -296,7 +358,7 @@ struct vm_struct *remove_vm_area(void *a + return v; + } + +-void __vunmap(void *addr, int deallocate_pages) ++void __vunmap(void *addr, int deallocate_pages, int uncharge) + { + struct vm_struct *area; + +@@ -320,6 +382,8 @@ void __vunmap(void *addr, int deallocate + if (deallocate_pages) { + int i; + ++ if (uncharge) ++ dec_vmalloc_charged(area); + for (i = 0; i < area->nr_pages; i++) { + if (unlikely(!area->pages[i])) + BUG(); +@@ -350,7 +414,7 @@ void __vunmap(void *addr, int deallocate + void vfree(void *addr) + { + BUG_ON(in_interrupt()); +- __vunmap(addr, 1); ++ __vunmap(addr, 1, 1); + } + EXPORT_SYMBOL(vfree); + +@@ -367,7 +431,7 @@ EXPORT_SYMBOL(vfree); + void vunmap(void *addr) + { + BUG_ON(in_interrupt()); +- __vunmap(addr, 0); ++ __vunmap(addr, 0, 0); + } + EXPORT_SYMBOL(vunmap); + +@@ -439,10 +503,12 @@ void *__vmalloc_area_node(struct vm_stru + + if (map_vm_area(area, prot, &pages)) + goto fail; ++ ++ inc_vmalloc_charged(area, gfp_mask); + return area->addr; + + fail: +- vfree(area->addr); ++ __vunmap(area->addr, 1, 0); + return NULL; + } + +@@ -486,6 +552,21 @@ void *__vmalloc(unsigned long size, gfp_ + } + EXPORT_SYMBOL(__vmalloc); + ++static void *____vmalloc(unsigned long size, gfp_t mask, pgprot_t prot) ++{ ++ struct vm_struct *area; ++ ++ size = PAGE_ALIGN(size); ++ if (!size || (size >> PAGE_SHIFT) > num_physpages) ++ return NULL; ++ ++ area = get_vm_area_best(size, VM_ALLOC); ++ if (!area) ++ return NULL; ++ ++ return __vmalloc_area_node(area, mask, prot, -1); ++} ++ + /** + * vmalloc - allocate virtually contiguous memory + * +@@ -503,6 +584,20 @@ void *vmalloc(unsigned long size) + } + EXPORT_SYMBOL(vmalloc); + ++void *vmalloc_best(unsigned long size) ++{ ++ return ____vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); ++} ++ ++EXPORT_SYMBOL(vmalloc_best); ++ ++void *ub_vmalloc_best(unsigned long size) ++{ ++ return ____vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); ++} ++ ++EXPORT_SYMBOL(ub_vmalloc_best); ++ + /** + * vmalloc_node - allocate memory on a specific node + * +@@ -631,3 +726,37 @@ finished: + read_unlock(&vmlist_lock); + return buf - buf_start; + } ++ ++void vprintstat(void) ++{ ++ struct vm_struct *p, *last_p = NULL; ++ unsigned long addr, size, free_size, max_free_size; ++ int num; ++ ++ addr = VMALLOC_START; ++ size = max_free_size = 0; ++ num = 0; ++ ++ read_lock(&vmlist_lock); ++ for (p = vmlist; p; p = p->next) { ++ free_size = (unsigned long)p->addr - addr; ++ if (free_size > max_free_size) ++ max_free_size = free_size; ++ addr = (unsigned long)p->addr + p->size; ++ size += p->size; ++ ++num; ++ last_p = p; ++ } ++ if (last_p) { ++ free_size = VMALLOC_END - ++ ((unsigned long)last_p->addr + last_p->size); ++ if (free_size > max_free_size) ++ max_free_size = free_size; ++ } ++ read_unlock(&vmlist_lock); ++ ++ printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n" ++ " Max_Free: %luKB Start: %lx End: %lx\n", ++ size/1024, (VMALLOC_END - VMALLOC_START)/1024, num, ++ max_free_size/1024, VMALLOC_START, VMALLOC_END); ++} +diff -uprN linux-2.6.15.orig/mm/vmscan.c linux-2.6.15-ve025stab014/mm/vmscan.c +--- linux-2.6.15.orig/mm/vmscan.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/mm/vmscan.c 2006-01-27 
14:48:08.000000000 +0300 +@@ -39,6 +39,8 @@ + + #include <linux/swapops.h> + ++#include <ub/ub_oom.h> ++ + /* possible outcome of pageout() */ + typedef enum { + /* failed to write page out, page is locked */ +@@ -82,6 +84,9 @@ struct scan_control { + * In this context, it doesn't matter that we scan the + * whole list at once. */ + int swap_cluster_max; ++#ifdef CONFIG_USER_RESOURCE ++ struct oom_freeing_stat oom_stat; ++#endif + }; + + /* +@@ -186,12 +191,14 @@ EXPORT_SYMBOL(remove_shrinker); + * + * Returns the number of slab objects which we shrunk. + */ +-static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, ++static int shrink_slab(struct scan_control *sc, gfp_t gfp_mask, + unsigned long lru_pages) + { ++ unsigned long scanned; + struct shrinker *shrinker; + int ret = 0; + ++ scanned = sc->nr_scanned; + if (scanned == 0) + scanned = SWAP_CLUSTER_MAX; + +@@ -237,7 +244,7 @@ static int shrink_slab(unsigned long sca + ret += nr_before - shrink_ret; + mod_page_state(slabs_scanned, this_scan); + total_scan -= this_scan; +- ++ ub_oom_inc(sc, slab, shrink_ret); + cond_resched(); + } + +@@ -434,6 +441,7 @@ static int shrink_list(struct list_head + goto keep_locked; + if (!add_to_swap(page)) + goto activate_locked; ++ ub_oom_inc(sc, swap, 1); + } + #endif /* CONFIG_SWAP */ + +@@ -471,6 +479,7 @@ static int shrink_list(struct list_head + case PAGE_ACTIVATE: + goto activate_locked; + case PAGE_SUCCESS: ++ ub_oom_inc(sc, write, 1); + if (PageWriteback(page) || PageDirty(page)) + goto keep; + /* +@@ -658,6 +667,7 @@ static void shrink_cache(struct zone *zo + else + mod_page_state_zone(zone, pgscan_direct, nr_scan); + nr_freed = shrink_list(&page_list, sc); ++ ub_oom_inc(sc, free, nr_freed); + if (current_is_kswapd()) + mod_page_state(kswapd_steal, nr_freed); + mod_page_state_zone(zone, pgsteal, nr_freed); +@@ -722,6 +732,7 @@ refill_inactive_zone(struct zone *zone, + long distress; + long swap_tendency; + ++ KSTAT_PERF_ENTER(refill_inact) + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, +@@ -830,6 +841,7 @@ refill_inactive_zone(struct zone *zone, + + mod_page_state_zone(zone, pgrefill, pgscanned); + mod_page_state(pgdeactivate, pgdeactivate); ++ KSTAT_PERF_LEAVE(refill_inact); + } + + /* +@@ -950,10 +962,14 @@ int try_to_free_pages(struct zone **zone + unsigned long lru_pages = 0; + int i; + ++ KSTAT_PERF_ENTER(ttfp); ++ ++ memset(&sc, 0, sizeof(sc)); + sc.gfp_mask = gfp_mask; + sc.may_writepage = 0; + sc.may_swap = 1; + ++ ub_oom_init(); + inc_page_state(allocstall); + + for (i = 0; zones[i] != NULL; i++) { +@@ -975,7 +991,7 @@ int try_to_free_pages(struct zone **zone + if (!priority) + disable_swap_token(); + shrink_caches(zones, &sc); +- shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); ++ shrink_slab(&sc, gfp_mask, lru_pages); + if (reclaim_state) { + sc.nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; +@@ -1012,7 +1028,8 @@ out: + + zone->prev_priority = zone->temp_priority; + } +- return ret; ++ KSTAT_PERF_LEAVE(ttfp); ++ return ret | ub_oom_did_progress(&sc); + } + + /* +@@ -1143,7 +1160,7 @@ scan: + shrink_zone(zone, &sc); + atomic_dec(&zone->reclaim_in_progress); + reclaim_state->reclaimed_slab = 0; +- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, ++ nr_slab = shrink_slab(&sc, GFP_KERNEL, + lru_pages); + sc.nr_reclaimed += reclaim_state->reclaimed_slab; + total_reclaimed += sc.nr_reclaimed; +@@ -1346,7 +1363,8 @@ static int __init kswapd_init(void) + swap_setup(); + 
for_each_pgdat(pgdat) + pgdat->kswapd +- = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); ++ = find_task_by_pid_all(kernel_thread(kswapd, ++ pgdat, CLONE_KERNEL)); + total_memory = nr_free_pagecache_pages(); + hotcpu_notifier(cpu_callback, 0); + return 0; +diff -uprN linux-2.6.15.orig/net/core/datagram.c linux-2.6.15-ve025stab014/net/core/datagram.c +--- linux-2.6.15.orig/net/core/datagram.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/datagram.c 2006-01-27 14:48:06.000000000 +0300 +@@ -55,6 +55,8 @@ + #include <net/sock.h> + #include <net/tcp_states.h> + ++#include <ub/ub_net.h> ++ + /* + * Is a socket 'connection oriented' ? + */ +@@ -432,6 +434,7 @@ unsigned int datagram_poll(struct file * + { + struct sock *sk = sock->sk; + unsigned int mask; ++ int no_ubc_space; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; +@@ -439,8 +442,14 @@ unsigned int datagram_poll(struct file * + /* exceptional events? */ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; +- if (sk->sk_shutdown == SHUTDOWN_MASK) ++ if (sk->sk_shutdown == SHUTDOWN_MASK) { ++ no_ubc_space = 0; + mask |= POLLHUP; ++ } else { ++ no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); ++ if (no_ubc_space) ++ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); ++ } + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue) || +@@ -457,7 +466,7 @@ unsigned int datagram_poll(struct file * + } + + /* writable? */ +- if (sock_writeable(sk)) ++ if (!no_ubc_space && sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +diff -uprN linux-2.6.15.orig/net/core/dev.c linux-2.6.15-ve025stab014/net/core/dev.c +--- linux-2.6.15.orig/net/core/dev.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/dev.c 2006-01-27 14:48:08.000000000 +0300 +@@ -114,6 +114,10 @@ + #include <net/iw_handler.h> + #endif /* CONFIG_NET_RADIO */ + #include <asm/current.h> ++#include <ub/beancounter.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> + + /* + * The list of packet types we will receive (as opposed to discard) +@@ -166,25 +170,40 @@ static struct list_head ptype_all; /* T + * unregister_netdevice(), which must be called with the rtnl + * semaphore held. 
+ */ ++#ifdef CONFIG_VE ++#define dev_tail (get_exec_env()->_net_dev_tail) ++#else + struct net_device *dev_base; + static struct net_device **dev_tail = &dev_base; ++EXPORT_SYMBOL(dev_base); ++#endif + DEFINE_RWLOCK(dev_base_lock); + +-EXPORT_SYMBOL(dev_base); + EXPORT_SYMBOL(dev_base_lock); + ++#ifdef CONFIG_VE ++#define MAX_UNMOVABLE_NETDEVICES (8*4096) ++static uint8_t unmovable_ifindex_list[MAX_UNMOVABLE_NETDEVICES/8]; ++static LIST_HEAD(dev_global_list); ++#endif ++ + #define NETDEV_HASHBITS 8 + static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; + static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; + +-static inline struct hlist_head *dev_name_hash(const char *name) ++struct hlist_head *dev_name_hash(const char *name, struct ve_struct *env) + { +- unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); ++ unsigned hash; ++ if (!ve_is_super(env)) ++ return visible_dev_head(env); ++ hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)]; + } + +-static inline struct hlist_head *dev_index_hash(int ifindex) ++struct hlist_head *dev_index_hash(int ifindex, struct ve_struct *env) + { ++ if (!ve_is_super(env)) ++ return visible_dev_index_head(env); + return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)]; + } + +@@ -468,7 +487,7 @@ struct net_device *__dev_get_by_name(con + { + struct hlist_node *p; + +- hlist_for_each(p, dev_name_hash(name)) { ++ hlist_for_each(p, dev_name_hash(name, get_exec_env())) { + struct net_device *dev + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(dev->name, name, IFNAMSIZ)) +@@ -501,6 +520,32 @@ struct net_device *dev_get_by_name(const + } + + /** ++ * __dev_global_get_by_name - find a device by its name in dev_global_list ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. 
++ */ ++ ++#ifdef CONFIG_VE ++struct net_device *__dev_global_get_by_name(const char *name) ++{ ++ struct net_device *dev; ++ /* It's called relatively rarely */ ++ list_for_each_entry(dev, &dev_global_list, dev_global_list_entry) { ++ if (strncmp(dev->name, name, IFNAMSIZ) == 0) ++ return dev; ++ } ++ return NULL; ++} ++#else /* CONFIG_VE */ ++#define __dev_global_get_by_name(name) __dev_get_by_name(name) ++#endif /* CONFIG_VE */ ++ ++/** + * __dev_get_by_index - find a device by its ifindex + * @ifindex: index of device + * +@@ -515,7 +560,7 @@ struct net_device *__dev_get_by_index(in + { + struct hlist_node *p; + +- hlist_for_each(p, dev_index_hash(ifindex)) { ++ hlist_for_each(p, dev_index_hash(ifindex, get_exec_env())) { + struct net_device *dev + = hlist_entry(p, struct net_device, index_hlist); + if (dev->ifindex == ifindex) +@@ -634,6 +679,23 @@ static int dev_valid_name(const char *na + || strchr(name, '/')); + } + ++static inline void __dev_check_name(const char *dev_name, const char *name, ++ long *inuse, const int max_netdevices) ++{ ++ int i = 0; ++ char buf[IFNAMSIZ]; ++ ++ if (!sscanf(dev_name, name, &i)) ++ return; ++ if (i < 0 || i >= max_netdevices) ++ return; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, sizeof(buf), name, i); ++ if (!strncmp(buf, dev_name, IFNAMSIZ)) ++ set_bit(i, inuse); ++} ++ + /** + * dev_alloc_name - allocate a name for a device + * @dev: device +@@ -670,16 +732,20 @@ int dev_alloc_name(struct net_device *de + if (!inuse) + return -ENOMEM; + +- for (d = dev_base; d; d = d->next) { +- if (!sscanf(d->name, name, &i)) +- continue; +- if (i < 0 || i >= max_netdevices) +- continue; +- +- /* avoid cases where sscanf is not exact inverse of printf */ +- snprintf(buf, sizeof(buf), name, i); +- if (!strncmp(buf, d->name, IFNAMSIZ)) +- set_bit(i, inuse); ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) { ++ list_for_each_entry(d, &dev_global_list, ++ dev_global_list_entry) { ++ __dev_check_name(d->name, name, inuse, ++ max_netdevices); ++ } ++ } else ++#endif ++ { ++ for (d = dev_base; d; d = d->next) { ++ __dev_check_name(d->name, name, inuse, ++ max_netdevices); ++ } + } + + i = find_first_zero_bit(inuse, max_netdevices); +@@ -687,7 +753,11 @@ int dev_alloc_name(struct net_device *de + } + + snprintf(buf, sizeof(buf), name, i); +- if (!__dev_get_by_name(buf)) { ++ if (ve_is_super(get_exec_env())) ++ d = __dev_global_get_by_name(buf); ++ else ++ d = __dev_get_by_name(buf); ++ if (d == NULL) { + strlcpy(dev->name, buf, IFNAMSIZ); + return i; + } +@@ -720,13 +790,14 @@ int dev_change_name(struct net_device *d + if (!dev_valid_name(newname)) + return -EINVAL; + ++ /* Rename of devices in VE is prohibited by CAP_NET_ADMIN */ + if (strchr(newname, '%')) { + err = dev_alloc_name(dev, newname); + if (err < 0) + return err; + strcpy(newname, dev->name); + } +- else if (__dev_get_by_name(newname)) ++ else if (__dev_global_get_by_name(newname)) + return -EEXIST; + else + strlcpy(dev->name, newname, IFNAMSIZ); +@@ -734,7 +805,8 @@ int dev_change_name(struct net_device *d + err = class_device_rename(&dev->class_dev, dev->name); + if (!err) { + hlist_del(&dev->name_hlist); +- hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); ++ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name, ++ get_exec_env())); + notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); + } + +@@ -1296,6 +1368,25 @@ int dev_queue_xmit(struct sk_buff *skb) + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); + #endif + if (q->enqueue) { 
++ struct user_beancounter *ub; ++ ++ ub = netdev_bc(dev)->exec_ub; ++ /* the skb CAN be already charged if it transmitted via ++ * something like bonding device */ ++ if (ub && (skb_bc(skb)->resource == 0)) { ++ unsigned long chargesize; ++ chargesize = skb_charge_fullsize(skb); ++ if (charge_beancounter(ub, UB_OTHERSOCKBUF, ++ chargesize, UB_SOFT)) { ++ rcu_read_unlock(); ++ rc = -ENOMEM; ++ goto out_kfree_skb; ++ } ++ skb_bc(skb)->ub = ub; ++ skb_bc(skb)->charged = chargesize; ++ skb_bc(skb)->resource = UB_OTHERSOCKBUF; ++ } ++ + /* Grab device queue */ + spin_lock(&dev->queue_lock); + +@@ -1582,6 +1673,7 @@ int netif_receive_skb(struct sk_buff *sk + struct net_device *orig_dev; + int ret = NET_RX_DROP; + unsigned short type; ++ struct ve_struct *old_env; + + /* if we've gotten here through NAPI, check netpoll */ + if (skb->dev->poll && netpoll_rx(skb)) +@@ -1600,6 +1692,17 @@ int netif_receive_skb(struct sk_buff *sk + skb->h.raw = skb->nh.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->mac.raw; + ++#ifdef CONFIG_VE ++ /* ++ * Skb might be alloced in another VE context, than its device works. ++ * So, set the correct owner_env. ++ */ ++ skb->owner_env = skb->dev->owner_env; ++ BUG_ON(skb->owner_env == NULL); ++#endif ++ ++ old_env = set_exec_env(VE_OWNER_SKB(skb)); ++ + pt_prev = NULL; + + rcu_read_lock(); +@@ -1665,6 +1768,7 @@ ncls: + + out: + rcu_read_unlock(); ++ (void)set_exec_env(old_env); + return ret; + } + +@@ -2040,7 +2144,7 @@ static int __init dev_proc_init(void) + { + int rc = -ENOMEM; + +- if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) ++ if (!proc_glob_fops_create("net/dev", S_IRUGO, &dev_seq_fops)) + goto out; + if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) + goto out_dev; +@@ -2052,7 +2156,7 @@ out: + out_softnet: + proc_net_remove("softnet_stat"); + out_dev: +- proc_net_remove("dev"); ++ remove_proc_glob_entry("net/dev", NULL); + goto out; + } + #else +@@ -2117,6 +2221,9 @@ void dev_set_promiscuity(struct net_devi + dev->flags &= ~IFF_PROMISC; + else + dev->flags |= IFF_PROMISC; ++ /* Promiscous mode on these devices does not mean anything */ ++ if (dev->flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) ++ return; + if (dev->flags != old_flags) { + dev_mc_upload(dev); + printk(KERN_INFO "device %s %s promiscuous mode\n", +@@ -2531,9 +2638,28 @@ int dev_ioctl(unsigned int cmd, void __u + * - require strict serialization. + * - do not return a value + */ ++ case SIOCSIFMTU: ++ if (!capable(CAP_NET_ADMIN) && ++ !capable(CAP_VE_NET_ADMIN)) ++ return -EPERM; ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ if (!ve_is_super(get_exec_env())) { ++ struct net_device *dev; ++ ret = -ENODEV; ++ if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL) ++ goto out_set_mtu_unlock; ++ ret = -EPERM; ++ if (ifr.ifr_mtu > dev->orig_mtu) ++ goto out_set_mtu_unlock; ++ } ++ ret = dev_ifsioc(&ifr, cmd); ++out_set_mtu_unlock: ++ rtnl_unlock(); ++ return ret; ++ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: +- case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: +@@ -2614,20 +2740,73 @@ int dev_ioctl(unsigned int cmd, void __u + * dev_new_index - allocate an ifindex + * + * Returns a suitable unique value for a new device interface +- * number. The caller must hold the rtnl semaphore or the ++ * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. 
++ * ++ * Note: dev->name must be valid on entrance + */ +-static int dev_new_index(void) ++static int dev_ve_new_index(void) + { +- static int ifindex; ++#ifdef CONFIG_VE ++ int *ifindex = &get_exec_env()->ifindex; ++ int delta = 2; ++#else ++ static int s_ifindex; ++ int *ifindex = &s_ifindex; ++ int delta = 1; ++#endif + for (;;) { +- if (++ifindex <= 0) +- ifindex = 1; +- if (!__dev_get_by_index(ifindex)) +- return ifindex; ++ *ifindex += delta; ++ if (*ifindex <= 0) ++ *ifindex = 1; ++ if (!__dev_get_by_index(*ifindex)) ++ return *ifindex; + } + } + ++#ifdef CONFIG_VE ++static int dev_glb_new_index(void) ++{ ++ int i; ++ ++ i = find_first_zero_bit((long*)unmovable_ifindex_list, ++ MAX_UNMOVABLE_NETDEVICES); ++ ++ if (i == MAX_UNMOVABLE_NETDEVICES) ++ return -EMFILE; ++ ++ __set_bit(i, (long*)unmovable_ifindex_list); ++ return (i + 1) * 2; ++} ++#endif ++ ++static void dev_glb_free_index(struct net_device *dev) ++{ ++#ifdef CONFIG_VE ++ int bit; ++ ++ bit = dev->ifindex / 2 - 1; ++ BUG_ON(bit >= MAX_UNMOVABLE_NETDEVICES); ++ __clear_bit(bit, (long*)unmovable_ifindex_list); ++#endif ++} ++ ++static int dev_new_index(struct net_device *dev) ++{ ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) ++ return dev_glb_new_index(); ++#endif ++ ++ return dev_ve_new_index(); ++} ++ ++static void dev_free_index(struct net_device *dev) ++{ ++ if ((dev->ifindex % 2) == 0) ++ dev_glb_free_index(dev); ++} ++ + static int dev_boot_phase = 1; + + /* Delayed registration/unregisteration */ +@@ -2670,6 +2849,10 @@ int register_netdevice(struct net_device + /* When net_device's are persistent, this will be fatal. */ + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + ++ ret = -EPERM; ++ if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) ++ goto out; ++ + spin_lock_init(&dev->queue_lock); + spin_lock_init(&dev->xmit_lock); + dev->xmit_lock_owner = -1; +@@ -2689,27 +2872,32 @@ int register_netdevice(struct net_device + if (ret) { + if (ret > 0) + ret = -EIO; +- goto out_err; ++ goto out_free_div; + } + } + + if (!dev_valid_name(dev->name)) { + ret = -EINVAL; +- goto out_err; ++ goto out_free_div; ++ } ++ ++ dev->ifindex = dev_new_index(dev); ++ if (dev->ifindex < 0) { ++ ret = dev->ifindex; ++ goto out_free_div; + } + +- dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + + /* Check for existence of name */ +- head = dev_name_hash(dev->name); ++ head = dev_name_hash(dev->name, get_exec_env()); + hlist_for_each(p, head) { + struct net_device *d + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(d->name, dev->name, IFNAMSIZ)) { + ret = -EEXIST; +- goto out_err; ++ goto out_free_ind; + } + } + +@@ -2761,12 +2949,21 @@ int register_netdevice(struct net_device + set_bit(__LINK_STATE_PRESENT, &dev->state); + + dev->next = NULL; ++ dev->owner_env = get_exec_env(); ++ dev->orig_mtu = dev->mtu; ++ netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub()); ++ netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub()); + dev_init_scheduler(dev); ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) ++ list_add_tail(&dev->dev_global_list_entry, &dev_global_list); ++#endif + write_lock_bh(&dev_base_lock); + *dev_tail = dev; + dev_tail = &dev->next; + hlist_add_head(&dev->name_hlist, head); +- hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); ++ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex, ++ get_exec_env())); + dev_hold(dev); + dev->reg_state = NETREG_REGISTERING; + write_unlock_bh(&dev_base_lock); 
+@@ -2780,7 +2977,9 @@ int register_netdevice(struct net_device + + out: + return ret; +-out_err: ++out_free_ind: ++ dev_free_index(dev); ++out_free_div: + free_divert_blk(dev); + goto out; + } +@@ -2826,6 +3025,10 @@ int register_netdev(struct net_device *d + err = register_netdevice(dev); + out: + rtnl_unlock(); ++ if (err == 0 && dev->reg_state != NETREG_REGISTERED) { ++ unregister_netdev(dev); ++ err = -ENOMEM; ++ } + return err; + } + EXPORT_SYMBOL(register_netdev); +@@ -2908,6 +3111,7 @@ void netdev_run_todo(void) + { + struct list_head list = LIST_HEAD_INIT(list); + int err; ++ struct ve_struct *current_env; + + + /* Need to guard against multiple cpu's getting out of order. */ +@@ -2926,22 +3130,30 @@ void netdev_run_todo(void) + list_splice_init(&net_todo_list, &list); + spin_unlock(&net_todo_list_lock); + ++ current_env = get_exec_env(); + while (!list_empty(&list)) { + struct net_device *dev + = list_entry(list.next, struct net_device, todo_list); + list_del(&dev->todo_list); + ++ (void)set_exec_env(dev->owner_env); + switch(dev->reg_state) { + case NETREG_REGISTERING: + err = netdev_register_sysfs(dev); +- if (err) ++ if (err) { + printk(KERN_ERR "%s: failed sysfs registration (%d)\n", + dev->name, err); ++ dev->reg_state = NETREG_REGISTER_ERR; ++ break; ++ } + dev->reg_state = NETREG_REGISTERED; + break; + + case NETREG_UNREGISTERING: + netdev_unregister_sysfs(dev); ++ /* fall through */ ++ ++ case NETREG_REGISTER_ERR: + dev->reg_state = NETREG_UNREGISTERED; + + netdev_wait_allrefs(dev); +@@ -2952,6 +3164,10 @@ void netdev_run_todo(void) + BUG_TRAP(!dev->ip6_ptr); + BUG_TRAP(!dev->dn_ptr); + ++ put_beancounter(netdev_bc(dev)->exec_ub); ++ put_beancounter(netdev_bc(dev)->owner_ub); ++ netdev_bc(dev)->exec_ub = NULL; ++ netdev_bc(dev)->owner_ub = NULL; + + /* It must be the very last action, + * after this 'dev' may point to freed up memory. +@@ -2966,6 +3182,7 @@ void netdev_run_todo(void) + break; + } + } ++ (void)set_exec_env(current_env); + + out: + up(&net_todo_run_mutex); +@@ -2991,7 +3208,7 @@ struct net_device *alloc_netdev(int size + alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; + alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; + +- p = kmalloc(alloc_size, GFP_KERNEL); ++ p = ub_kmalloc(alloc_size, GFP_KERNEL); + if (!p) { + printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); + return NULL; +@@ -3071,7 +3288,8 @@ int unregister_netdevice(struct net_devi + return -ENODEV; + } + +- BUG_ON(dev->reg_state != NETREG_REGISTERED); ++ BUG_ON(dev->reg_state != NETREG_REGISTERED && ++ dev->reg_state != NETREG_REGISTER_ERR); + + /* If device is running, close it first. */ + if (dev->flags & IFF_UP) +@@ -3087,6 +3305,10 @@ int unregister_netdevice(struct net_devi + dev_tail = dp; + *dp = d->next; + write_unlock_bh(&dev_base_lock); ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) ++ list_del(&dev->dev_global_list_entry); ++#endif + break; + } + } +@@ -3096,7 +3318,8 @@ int unregister_netdevice(struct net_devi + return -ENODEV; + } + +- dev->reg_state = NETREG_UNREGISTERING; ++ if (dev->reg_state != NETREG_REGISTER_ERR) ++ dev->reg_state = NETREG_UNREGISTERING; + + synchronize_net(); + +@@ -3120,6 +3343,8 @@ int unregister_netdevice(struct net_devi + /* Notifier chain MUST detach us from master device. 
*/ + BUG_TRAP(!dev->master); + ++ dev_free_index(dev); ++ + free_divert_blk(dev); + + /* Finish processing unregister after unlock */ +@@ -3277,6 +3502,8 @@ EXPORT_SYMBOL(dev_get_by_flags); + EXPORT_SYMBOL(dev_get_by_index); + EXPORT_SYMBOL(dev_get_by_name); + EXPORT_SYMBOL(dev_ioctl); ++EXPORT_SYMBOL(dev_name_hash); ++EXPORT_SYMBOL(dev_index_hash); + EXPORT_SYMBOL(dev_open); + EXPORT_SYMBOL(dev_queue_xmit); + EXPORT_SYMBOL(dev_remove_pack); +diff -uprN linux-2.6.15.orig/net/core/dev_mcast.c linux-2.6.15-ve025stab014/net/core/dev_mcast.c +--- linux-2.6.15.orig/net/core/dev_mcast.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/dev_mcast.c 2006-01-27 14:48:08.000000000 +0300 +@@ -290,9 +290,10 @@ static struct file_operations dev_mc_seq + + void __init dev_mcast_init(void) + { +- proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops); ++ proc_glob_fops_create("net/dev_mcast", 0, &dev_mc_seq_fops); + } + + EXPORT_SYMBOL(dev_mc_add); + EXPORT_SYMBOL(dev_mc_delete); + EXPORT_SYMBOL(dev_mc_upload); ++EXPORT_SYMBOL(dev_mc_discard); +diff -uprN linux-2.6.15.orig/net/core/dst.c linux-2.6.15-ve025stab014/net/core/dst.c +--- linux-2.6.15.orig/net/core/dst.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/dst.c 2006-01-27 14:48:08.000000000 +0300 +@@ -241,13 +241,13 @@ static inline void dst_ifdown(struct dst + dst->input = dst_discard_in; + dst->output = dst_discard_out; + } else { +- dst->dev = &loopback_dev; +- dev_hold(&loopback_dev); ++ dst->dev = &visible_loopback_dev; ++ dev_hold(&visible_loopback_dev); + dev_put(dev); + if (dst->neighbour && dst->neighbour->dev == dev) { +- dst->neighbour->dev = &loopback_dev; ++ dst->neighbour->dev = &visible_loopback_dev; + dev_put(dev); +- dev_hold(&loopback_dev); ++ dev_hold(&visible_loopback_dev); + } + } + } +@@ -260,11 +260,14 @@ static int dst_dev_event(struct notifier + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: +- spin_lock_bh(&dst_lock); ++ local_bh_disable(); ++ dst_run_gc(0); ++ spin_lock(&dst_lock); + for (dst = dst_garbage_list; dst; dst = dst->next) { + dst_ifdown(dst, dev, event != NETDEV_DOWN); + } +- spin_unlock_bh(&dst_lock); ++ spin_unlock(&dst_lock); ++ local_bh_enable(); + break; + } + return NOTIFY_DONE; +diff -uprN linux-2.6.15.orig/net/core/filter.c linux-2.6.15-ve025stab014/net/core/filter.c +--- linux-2.6.15.orig/net/core/filter.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/filter.c 2006-01-27 14:48:06.000000000 +0300 +@@ -363,7 +363,7 @@ int sk_attach_filter(struct sock_fprog * + if (fprog->filter == NULL) + return -EINVAL; + +- fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); ++ fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC); + if (!fp) + return -ENOMEM; + if (copy_from_user(fp->insns, fprog->filter, fsize)) { +diff -uprN linux-2.6.15.orig/net/core/neighbour.c linux-2.6.15-ve025stab014/net/core/neighbour.c +--- linux-2.6.15.orig/net/core/neighbour.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/neighbour.c 2006-01-27 14:48:08.000000000 +0300 +@@ -727,6 +727,11 @@ static void neigh_timer_handler(unsigned + struct neighbour *neigh = (struct neighbour *)arg; + unsigned state; + int notify = 0; ++ struct ve_struct *env; ++ struct user_beancounter *ub; ++ ++ env = set_exec_env(neigh->dev->owner_env); ++ ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub); + + write_lock(&neigh->lock); + +@@ -824,6 +829,8 @@ out: + neigh_app_notify(neigh); + #endif + neigh_release(neigh); ++ 
(void)set_exec_ub(ub); ++ (void)set_exec_env(env); + } + + int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +@@ -1213,6 +1220,12 @@ static void neigh_proxy_process(unsigned + skb = skb->next; + if (tdif <= 0) { + struct net_device *dev = back->dev; ++ struct ve_struct *env; ++ struct user_beancounter *ub; ++ ++ env = set_exec_env(dev->owner_env); ++ ub = set_exec_ub(netdev_bc(dev)->exec_ub); ++ + __skb_unlink(back, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) + tbl->proxy_redo(back); +@@ -1220,6 +1233,9 @@ static void neigh_proxy_process(unsigned + kfree_skb(back); + + dev_put(dev); ++ ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(env); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } +@@ -1424,6 +1440,9 @@ int neigh_delete(struct sk_buff *skb, st + struct net_device *dev = NULL; + int err = -ENODEV; + ++ if (!ve_is_super(get_exec_env())) ++ return -EACCES; ++ + if (ndm->ndm_ifindex && + (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + goto out; +@@ -1475,6 +1494,9 @@ int neigh_add(struct sk_buff *skb, struc + struct net_device *dev = NULL; + int err = -ENODEV; + ++ if (!ve_is_super(get_exec_env())) ++ return -EACCES; ++ + if (ndm->ndm_ifindex && + (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + goto out; +@@ -1936,6 +1958,9 @@ int neigh_dump_info(struct sk_buff *skb, + struct neigh_table *tbl; + int t, family, s_t; + ++ if (!ve_is_super(get_exec_env())) ++ return -EACCES; ++ + read_lock(&neigh_tbl_lock); + family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family; + s_t = cb->args[0]; +@@ -2530,11 +2555,17 @@ int neigh_sysctl_register(struct net_dev + int p_id, int pdev_id, char *p_name, + proc_handler *handler, ctl_handler *strategy) + { +- struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); ++ struct neigh_sysctl_table *t; + const char *dev_name_source = NULL; + char *dev_name = NULL; + int err = 0; + ++ /* This function is called from VExx only from devinet_init, ++ and it is does not matter what is returned */ ++ if (!ve_is_super(get_exec_env())) ++ return 0; ++ ++ t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOBUFS; + memcpy(t, &neigh_sysctl_template, sizeof(*t)); +@@ -2625,6 +2656,8 @@ int neigh_sysctl_register(struct net_dev + + void neigh_sysctl_unregister(struct neigh_parms *p) + { ++ if (!ve_is_super(get_exec_env())) ++ return; + if (p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; +diff -uprN linux-2.6.15.orig/net/core/net-sysfs.c linux-2.6.15-ve025stab014/net/core/net-sysfs.c +--- linux-2.6.15.orig/net/core/net-sysfs.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/net-sysfs.c 2006-01-27 14:48:08.000000000 +0300 +@@ -399,7 +399,8 @@ static void netdev_release(struct class_ + struct net_device *dev + = container_of(cd, struct net_device, class_dev); + +- BUG_ON(dev->reg_state != NETREG_RELEASED); ++ BUG_ON(dev->reg_state != NETREG_RELEASED && ++ dev->reg_state != NETREG_REGISTERING); + + kfree((char *)dev - dev->padded); + } +@@ -411,6 +412,13 @@ static struct class net_class = { + .hotplug = netdev_hotplug, + #endif + }; ++EXPORT_SYMBOL(net_class); ++ ++#ifndef CONFIG_VE ++#define visible_net_class net_class ++#else ++#define visible_net_class (*get_exec_env()->net_class) ++#endif + + void netdev_unregister_sysfs(struct net_device * net) + { +@@ -435,7 +443,7 @@ int netdev_register_sysfs(struct net_dev + struct class_device_attribute *attr; + int ret; + +- class_dev->class = &net_class; ++ 
class_dev->class = &visible_net_class; + class_dev->class_data = net; + + strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE); +@@ -468,12 +476,21 @@ out_cleanup: + out_unreg: + printk(KERN_WARNING "%s: sysfs attribute registration failed %d\n", + net->name, ret); +- class_device_unregister(class_dev); ++ /* put is called in free_netdev() */ ++ class_device_del(class_dev); + out: + return ret; + } + ++void prepare_sysfs_netdev(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->net_class = &net_class; ++#endif ++} ++ + int netdev_sysfs_init(void) + { ++ prepare_sysfs_netdev(); + return class_register(&net_class); + } +diff -uprN linux-2.6.15.orig/net/core/rtnetlink.c linux-2.6.15-ve025stab014/net/core/rtnetlink.c +--- linux-2.6.15.orig/net/core/rtnetlink.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/rtnetlink.c 2006-01-27 14:48:08.000000000 +0300 +@@ -434,6 +434,8 @@ static int rtnetlink_dump_all(struct sk_ + if (rtnetlink_links[idx] == NULL || + rtnetlink_links[idx][type].dumpit == NULL) + continue; ++ if (vz_security_proto_check(idx, 0, 0)) ++ continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnetlink_links[idx][type].dumpit(skb, cb)) +@@ -501,7 +503,7 @@ rtnetlink_rcv_msg(struct sk_buff *skb, s + return 0; + + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; +- if (family >= NPROTO) { ++ if (family >= NPROTO || vz_security_proto_check(family, 0, 0)) { + *errp = -EAFNOSUPPORT; + return -1; + } +diff -uprN linux-2.6.15.orig/net/core/scm.c linux-2.6.15-ve025stab014/net/core/scm.c +--- linux-2.6.15.orig/net/core/scm.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/scm.c 2006-01-27 14:48:08.000000000 +0300 +@@ -33,6 +33,7 @@ + #include <net/compat.h> + #include <net/scm.h> + ++#include <ub/ub_mem.h> + + /* + * Only allow a user to send credentials, that they could set with +@@ -41,7 +42,9 @@ + + static __inline__ int scm_check_creds(struct ucred *creds) + { +- if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) && ++ if ((creds->pid == virt_tgid(current) || ++ creds->pid == current->tgid || ++ capable(CAP_VE_SYS_ADMIN)) && + ((creds->uid == current->uid || creds->uid == current->euid || + creds->uid == current->suid) || capable(CAP_SETUID)) && + ((creds->gid == current->gid || creds->gid == current->egid || +@@ -68,7 +71,7 @@ static int scm_fp_copy(struct cmsghdr *c + + if (!fpl) + { +- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); ++ fpl = ub_kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + *fplp = fpl; +@@ -274,7 +277,7 @@ struct scm_fp_list *scm_fp_dup(struct sc + if (!fpl) + return NULL; + +- new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); ++ new_fpl = ub_kmalloc(sizeof(*fpl), GFP_KERNEL); + if (new_fpl) { + for (i=fpl->count-1; i>=0; i--) + get_file(fpl->fp[i]); +diff -uprN linux-2.6.15.orig/net/core/skbuff.c linux-2.6.15-ve025stab014/net/core/skbuff.c +--- linux-2.6.15.orig/net/core/skbuff.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/skbuff.c 2006-01-27 14:48:08.000000000 +0300 +@@ -48,6 +48,7 @@ + #include <linux/in.h> + #include <linux/inet.h> + #include <linux/slab.h> ++#include <linux/kmem_cache.h> + #include <linux/netdevice.h> + #ifdef CONFIG_NET_CLS_ACT + #include <net/pkt_sched.h> +@@ -68,6 +69,8 @@ + #include <asm/uaccess.h> + #include <asm/system.h> + ++#include <ub/ub_net.h> ++ + static kmem_cache_t *skbuff_head_cache __read_mostly; + static kmem_cache_t *skbuff_fclone_cache __read_mostly; + +@@ -149,6 
+152,9 @@ struct sk_buff *__alloc_skb(unsigned int + if (!skb) + goto out; + ++ if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) ++ goto nobc; ++ + /* Get the DATA. Size must match skb_add_mtu(). */ + size = SKB_DATA_ALIGN(size); + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); +@@ -162,6 +168,7 @@ struct sk_buff *__alloc_skb(unsigned int + skb->data = data; + skb->tail = data; + skb->end = data + size; ++ SET_VE_OWNER_SKB(skb, get_exec_env()); + if (fclone) { + struct sk_buff *child = skb + 1; + atomic_t *fclone_ref = (atomic_t *) (child + 1); +@@ -181,6 +188,8 @@ struct sk_buff *__alloc_skb(unsigned int + out: + return skb; + nodata: ++ ub_skb_free_bc(skb); ++nobc: + kmem_cache_free(skbuff_head_cache, skb); + skb = NULL; + goto out; +@@ -213,6 +222,9 @@ struct sk_buff *alloc_skb_from_cache(kme + if (!skb) + goto out; + ++ if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) ++ goto nobc; ++ + /* Get the DATA. */ + size = SKB_DATA_ALIGN(size); + data = kmem_cache_alloc(cp, gfp_mask); +@@ -226,6 +238,7 @@ struct sk_buff *alloc_skb_from_cache(kme + skb->data = data; + skb->tail = data; + skb->end = data + size; ++ SET_VE_OWNER_SKB(skb, get_exec_env()); + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; +@@ -235,6 +248,8 @@ struct sk_buff *alloc_skb_from_cache(kme + out: + return skb; + nodata: ++ ub_skb_free_bc(skb); ++nobc: + kmem_cache_free(skbuff_head_cache, skb); + skb = NULL; + goto out; +@@ -289,6 +304,7 @@ void kfree_skbmem(struct sk_buff *skb) + atomic_t *fclone_ref; + + skb_release_data(skb); ++ ub_skb_free_bc(skb); + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: + kmem_cache_free(skbuff_head_cache, skb); +@@ -330,6 +346,7 @@ void __kfree_skb(struct sk_buff *skb) + #ifdef CONFIG_XFRM + secpath_put(skb->sp); + #endif ++ ub_skb_uncharge(skb); + if (skb->destructor) { + WARN_ON(in_irq()); + skb->destructor(skb); +@@ -385,6 +402,11 @@ struct sk_buff *skb_clone(struct sk_buff + n->fclone = SKB_FCLONE_UNAVAILABLE; + } + ++ if (ub_skb_alloc_bc(n, gfp_mask)) { ++ kmem_cache_free(skbuff_head_cache, n); ++ return NULL; ++ } ++ + #define C(x) n->x = skb->x + + n->next = n->prev = NULL; +@@ -411,6 +433,7 @@ struct sk_buff *skb_clone(struct sk_buff + C(ip_summed); + C(priority); + C(protocol); ++ SET_VE_OWNER_SKB(n, VE_OWNER_SKB(skb)); + n->destructor = NULL; + #ifdef CONFIG_NETFILTER + C(nfmark); +diff -uprN linux-2.6.15.orig/net/core/sock.c linux-2.6.15-ve025stab014/net/core/sock.c +--- linux-2.6.15.orig/net/core/sock.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/sock.c 2006-01-27 14:48:08.000000000 +0300 +@@ -107,6 +107,7 @@ + #include <linux/net.h> + #include <linux/mm.h> + #include <linux/slab.h> ++#include <linux/kmem_cache.h> + #include <linux/interrupt.h> + #include <linux/poll.h> + #include <linux/tcp.h> +@@ -123,6 +124,9 @@ + #include <net/xfrm.h> + #include <linux/ipsec.h> + ++#include <ub/ub_net.h> ++#include <ub/beancounter.h> ++ + #include <linux/filter.h> + + #ifdef CONFIG_INET +@@ -171,7 +175,7 @@ static void sock_warn_obsolete_bsdism(co + static char warncomm[TASK_COMM_LEN]; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); +- printk(KERN_WARNING "process `%s' is using obsolete " ++ ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete " + "%s SO_BSDCOMPAT\n", warncomm, name); + warned++; + } +@@ -658,6 +662,7 @@ struct sock *sk_alloc(int family, gfp_t + */ + sk->sk_prot = sk->sk_prot_creator = prot; + sock_lock_init(sk); ++ SET_VE_OWNER_SK(sk, 
get_exec_env()); + } + + if (security_sk_alloc(sk, family, priority)) +@@ -697,6 +702,7 @@ void sk_free(struct sock *sk) + __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); + + security_sk_free(sk); ++ ub_sock_uncharge(sk); + if (sk->sk_prot_creator->slab != NULL) + kmem_cache_free(sk->sk_prot_creator->slab, sk); + else +@@ -713,6 +719,11 @@ struct sock *sk_clone(const struct sock + + memcpy(newsk, sk, sk->sk_prot->obj_size); + ++ if (ub_sock_charge(newsk, sk->sk_family, sk->sk_type) < 0) { ++ sk_free(newsk); ++ return NULL; ++ } ++ + /* SANITY */ + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); +@@ -933,14 +944,12 @@ static long sock_wait_for_wmem(struct so + /* + * Generic send/receive buffer handlers + */ +- +-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, +- unsigned long header_len, +- unsigned long data_len, +- int noblock, int *errcode) ++struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size, ++ unsigned long size2, int noblock, ++ int *errcode) + { + struct sk_buff *skb; +- gfp_t gfp_mask; ++ unsigned int gfp_mask; + long timeo; + int err; + +@@ -958,46 +967,35 @@ static struct sk_buff *sock_alloc_send_p + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto failure; + +- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { +- skb = alloc_skb(header_len, sk->sk_allocation); +- if (skb) { +- int npages; +- int i; +- +- /* No pages, we're done... */ +- if (!data_len) +- break; +- +- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; +- skb->truesize += data_len; +- skb_shinfo(skb)->nr_frags = npages; +- for (i = 0; i < npages; i++) { +- struct page *page; +- skb_frag_t *frag; +- +- page = alloc_pages(sk->sk_allocation, 0); +- if (!page) { +- err = -ENOBUFS; +- skb_shinfo(skb)->nr_frags = i; +- kfree_skb(skb); +- goto failure; +- } +- +- frag = &skb_shinfo(skb)->frags[i]; +- frag->page = page; +- frag->page_offset = 0; +- frag->size = (data_len >= PAGE_SIZE ? +- PAGE_SIZE : +- data_len); +- data_len -= PAGE_SIZE; +- } ++ if (ub_sock_getwres_other(sk, skb_charge_size(size))) { ++ if (size2 < size) { ++ size = size2; ++ continue; ++ } ++ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); ++ err = -EAGAIN; ++ if (!timeo) ++ goto failure; ++ if (signal_pending(current)) ++ goto interrupted; ++ timeo = ub_sock_wait_for_space(sk, timeo, ++ skb_charge_size(size)); ++ continue; ++ } + ++ if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { ++ skb = alloc_skb(size, sk->sk_allocation); ++ if (skb) + /* Full success... 
*/ + break; +- } ++ ub_sock_retwres_other(sk, skb_charge_size(size), ++ SOCK_MIN_UBCSPACE_CH); + err = -ENOBUFS; + goto failure; + } ++ ub_sock_retwres_other(sk, ++ skb_charge_size(size), ++ SOCK_MIN_UBCSPACE_CH); + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; +@@ -1008,6 +1006,7 @@ static struct sk_buff *sock_alloc_send_p + timeo = sock_wait_for_wmem(sk, timeo); + } + ++ ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF); + skb_set_owner_w(skb, sk); + return skb; + +@@ -1021,7 +1020,7 @@ failure: + struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + int noblock, int *errcode) + { +- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); ++ return sock_alloc_send_skb2(sk, size, size, noblock, errcode); + } + + static void __lock_sock(struct sock *sk) +@@ -1461,7 +1460,8 @@ int proto_register(struct proto *prot, i + + if (alloc_slab) { + prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, +- SLAB_HWCACHE_ALIGN, NULL, NULL); ++ SLAB_HWCACHE_ALIGN | SLAB_UBC, ++ NULL, NULL); + + if (prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", +@@ -1477,9 +1477,11 @@ int proto_register(struct proto *prot, i + goto out_free_sock_slab; + + sprintf(request_sock_slab_name, mask, prot->name); +- prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name, +- prot->rsk_prot->obj_size, 0, +- SLAB_HWCACHE_ALIGN, NULL, NULL); ++ prot->rsk_prot->slab = ++ kmem_cache_create(request_sock_slab_name, ++ prot->rsk_prot->obj_size, 0, ++ SLAB_HWCACHE_ALIGN | SLAB_UBC, ++ NULL, NULL); + + if (prot->rsk_prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", +@@ -1497,10 +1499,11 @@ int proto_register(struct proto *prot, i + goto out_free_request_sock_slab; + + sprintf(timewait_sock_slab_name, mask, prot->name); +- prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, +- prot->twsk_obj_size, +- 0, SLAB_HWCACHE_ALIGN, +- NULL, NULL); ++ prot->twsk_slab = ++ kmem_cache_create(timewait_sock_slab_name, ++ prot->twsk_obj_size, 0, ++ SLAB_HWCACHE_ALIGN | SLAB_UBC, ++ NULL, NULL); + if (prot->twsk_slab == NULL) + goto out_free_timewait_sock_slab_name; + } +diff -uprN linux-2.6.15.orig/net/core/stream.c linux-2.6.15-ve025stab014/net/core/stream.c +--- linux-2.6.15.orig/net/core/stream.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/core/stream.c 2006-01-27 14:48:06.000000000 +0300 +@@ -109,8 +109,9 @@ EXPORT_SYMBOL(sk_stream_wait_close); + * sk_stream_wait_memory - Wait for more memory for a socket + * @sk: socket to wait for memory + * @timeo_p: for how long ++ * @amount - amount of memory to wait for (in UB space!) 
+ */ +-int sk_stream_wait_memory(struct sock *sk, long *timeo_p) ++int sk_stream_wait_memory(struct sock *sk, long *timeo_p, unsigned long amount) + { + int err = 0; + long vm_wait = 0; +@@ -132,14 +133,19 @@ int sk_stream_wait_memory(struct sock *s + if (signal_pending(current)) + goto do_interrupted; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +- if (sk_stream_memory_free(sk) && !vm_wait) +- break; ++ if (amount == 0) { ++ if (sk_stream_memory_free(sk) && !vm_wait) ++ break; ++ } else ++ ub_sock_sndqueueadd_tcp(sk, amount); + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + sk_wait_event(sk, ¤t_timeo, sk_stream_memory_free(sk) && + vm_wait); + sk->sk_write_pending--; ++ if (amount > 0) ++ ub_sock_sndqueuedel(sk); + + if (vm_wait) { + vm_wait -= current_timeo; +diff -uprN linux-2.6.15.orig/net/dccp/ipv4.c linux-2.6.15-ve025stab014/net/dccp/ipv4.c +--- linux-2.6.15.orig/net/dccp/ipv4.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/dccp/ipv4.c 2006-01-27 14:48:08.000000000 +0300 +@@ -60,11 +60,16 @@ static int __dccp_v4_check_established(s + const int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); +- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); +- struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash); ++ unsigned int hash; ++ struct inet_ehash_bucket *head; + const struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; ++ struct ve_struct *env; ++ ++ env = VE_OWNER_SK(sk); ++ hash = inet_ehashfn(daddr, lport, saddr, inet->dport, VEID(env)); ++ head = inet_ehash_bucket(&tcp_hashinfo, hash); + + prefetch(head->chain.first); + write_lock(&head->lock); +@@ -73,14 +78,16 @@ static int __dccp_v4_check_established(s + sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { + tw = inet_twsk(sk2); + +- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) ++ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ++ ports, dif, env)) + goto not_unique; + } + tw = NULL; + + /* And established part... 
*/ + sk_for_each(sk2, node, &head->chain) { +- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) ++ if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ++ ports, dif, env)) + goto not_unique; + } + +@@ -133,7 +140,8 @@ static int dccp_v4_hash_connect(struct s + local_bh_disable(); + do { + head = &dccp_hashinfo.bhash[inet_bhashfn(rover, +- dccp_hashinfo.bhash_size)]; ++ dccp_hashinfo.bhash_size, ++ 0)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, +@@ -154,7 +162,7 @@ static int dccp_v4_hash_connect(struct s + } + + tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, +- head, rover); ++ head, rover, get_exec_env()); + if (tb == NULL) { + spin_unlock(&head->lock); + break; +@@ -191,7 +199,7 @@ ok: + } + + head = &dccp_hashinfo.bhash[inet_bhashfn(snum, +- dccp_hashinfo.bhash_size)]; ++ dccp_hashinfo.bhash_size, 0)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { +diff -uprN linux-2.6.15.orig/net/ipv4/af_inet.c linux-2.6.15-ve025stab014/net/ipv4/af_inet.c +--- linux-2.6.15.orig/net/ipv4/af_inet.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/af_inet.c 2006-01-27 14:48:08.000000000 +0300 +@@ -112,6 +112,7 @@ + #ifdef CONFIG_IP_MROUTE + #include <linux/mroute.h> + #endif ++#include <ub/ub_net.h> + + DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; + +@@ -296,6 +297,13 @@ lookup_protocol: + if (sk == NULL) + goto out; + ++ err = -ENOBUFS; ++ if (ub_sock_charge(sk, PF_INET, sock->type)) ++ goto out_sk_free; ++ /* if charge was successful, sock_init_data() MUST be called to ++ * set sk->sk_type. otherwise sk will be uncharged to wrong resource ++ */ ++ + err = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) +@@ -352,6 +360,9 @@ out: + out_rcu_unlock: + rcu_read_unlock(); + goto out; ++out_sk_free: ++ sk_free(sk); ++ return err; + } + + +@@ -366,6 +377,9 @@ int inet_release(struct socket *sock) + + if (sk) { + long timeout; ++ struct ve_struct *saved_env; ++ ++ saved_env = set_exec_env(VE_OWNER_SK(sk)); + + /* Applications forget to leave groups before exiting */ + ip_mc_drop_socket(sk); +@@ -383,6 +397,8 @@ int inet_release(struct socket *sock) + timeout = sk->sk_lingertime; + sock->sk = NULL; + sk->sk_prot->close(sk, timeout); ++ ++ (void)set_exec_env(saved_env); + } + return 0; + } +@@ -1104,20 +1120,20 @@ static struct net_protocol icmp_protocol + + static int __init init_ipv4_mibs(void) + { +- net_statistics[0] = alloc_percpu(struct linux_mib); +- net_statistics[1] = alloc_percpu(struct linux_mib); +- ip_statistics[0] = alloc_percpu(struct ipstats_mib); +- ip_statistics[1] = alloc_percpu(struct ipstats_mib); +- icmp_statistics[0] = alloc_percpu(struct icmp_mib); +- icmp_statistics[1] = alloc_percpu(struct icmp_mib); +- tcp_statistics[0] = alloc_percpu(struct tcp_mib); +- tcp_statistics[1] = alloc_percpu(struct tcp_mib); +- udp_statistics[0] = alloc_percpu(struct udp_mib); +- udp_statistics[1] = alloc_percpu(struct udp_mib); ++ ve_net_statistics[0] = alloc_percpu(struct linux_mib); ++ ve_net_statistics[1] = alloc_percpu(struct linux_mib); ++ ve_ip_statistics[0] = alloc_percpu(struct ipstats_mib); ++ ve_ip_statistics[1] = alloc_percpu(struct ipstats_mib); ++ ve_icmp_statistics[0] = alloc_percpu(struct icmp_mib); ++ ve_icmp_statistics[1] = alloc_percpu(struct icmp_mib); ++ ve_tcp_statistics[0] = alloc_percpu(struct tcp_mib); ++ ve_tcp_statistics[1] = alloc_percpu(struct tcp_mib); ++ 
ve_udp_statistics[0] = alloc_percpu(struct udp_mib); ++ ve_udp_statistics[1] = alloc_percpu(struct udp_mib); + if (! +- (net_statistics[0] && net_statistics[1] && ip_statistics[0] +- && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1] +- && udp_statistics[0] && udp_statistics[1])) ++ (ve_net_statistics[0] && ve_net_statistics[1] && ve_ip_statistics[0] ++ && ve_ip_statistics[1] && ve_tcp_statistics[0] && ve_tcp_statistics[1] ++ && ve_udp_statistics[0] && ve_udp_statistics[1])) + return -ENOMEM; + + (void) tcp_mib_init(); +diff -uprN linux-2.6.15.orig/net/ipv4/arp.c linux-2.6.15-ve025stab014/net/ipv4/arp.c +--- linux-2.6.15.orig/net/ipv4/arp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/arp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -986,7 +986,7 @@ static int arp_req_set(struct arpreq *r, + return 0; + } + if (dev == NULL) { +- ipv4_devconf.proxy_arp = 1; ++ ve_ipv4_devconf.proxy_arp = 1; + return 0; + } + if (__in_dev_get_rtnl(dev)) { +@@ -1092,7 +1092,7 @@ static int arp_req_delete(struct arpreq + return pneigh_delete(&arp_tbl, &ip, dev); + if (mask == 0) { + if (dev == NULL) { +- ipv4_devconf.proxy_arp = 0; ++ ve_ipv4_devconf.proxy_arp = 0; + return 0; + } + if (__in_dev_get_rtnl(dev)) { +@@ -1143,6 +1143,8 @@ int arp_ioctl(unsigned int cmd, void __u + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + case SIOCGARP: ++ if (!ve_is_super(get_exec_env())) ++ return -EACCES; + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; +@@ -1370,8 +1372,12 @@ static int arp_seq_open(struct inode *in + { + struct seq_file *seq; + int rc = -ENOMEM; +- struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL); +- ++ struct neigh_seq_state *s; ++ ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + goto out; + +@@ -1399,7 +1405,7 @@ static struct file_operations arp_seq_fo + + static int __init arp_proc_init(void) + { +- if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops)) ++ if (!proc_glob_fops_create("net/arp", S_IRUGO, &arp_seq_fops)) + return -ENOMEM; + return 0; + } +diff -uprN linux-2.6.15.orig/net/ipv4/devinet.c linux-2.6.15-ve025stab014/net/ipv4/devinet.c +--- linux-2.6.15.orig/net/ipv4/devinet.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/devinet.c 2006-01-27 14:48:08.000000000 +0300 +@@ -69,7 +69,7 @@ struct ipv4_devconf ipv4_devconf = { + .shared_media = 1, + }; + +-static struct ipv4_devconf ipv4_devconf_dflt = { ++struct ipv4_devconf ipv4_devconf_dflt = { + .accept_redirects = 1, + .send_redirects = 1, + .secure_redirects = 1, +@@ -77,10 +77,16 @@ static struct ipv4_devconf ipv4_devconf_ + .accept_source_route = 1, + }; + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_ipv4_devconf_dflt (*(get_exec_env()->_ipv4_devconf_dflt)) ++#else ++#define ve_ipv4_devconf_dflt ipv4_devconf_dflt ++#endif ++ + static void rtmsg_ifa(int event, struct in_ifaddr *); + + static struct notifier_block *inetaddr_chain; +-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, ++void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy); + #ifdef CONFIG_SYSCTL + static void devinet_sysctl_register(struct in_device *in_dev, +@@ -230,7 +236,7 @@ int inet_addr_onlink(struct in_device *i + return 0; + } + +-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, ++void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy) + { + 
struct in_ifaddr *promote = NULL; +@@ -576,7 +582,7 @@ int devinet_ioctl(unsigned int cmd, void + + case SIOCSIFFLAGS: + ret = -EACCES; +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + goto out; + break; + case SIOCSIFADDR: /* Set interface address (and family) */ +@@ -584,7 +590,7 @@ int devinet_ioctl(unsigned int cmd, void + case SIOCSIFDSTADDR: /* Set the destination address */ + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + ret = -EACCES; +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + goto out; + ret = -EINVAL; + if (sin->sin_family != AF_INET) +@@ -1001,7 +1007,7 @@ static int inetdev_event(struct notifier + case NETDEV_UP: + if (dev->mtu < 68) + break; +- if (dev == &loopback_dev) { ++ if (dev == &visible_loopback_dev) { + struct in_ifaddr *ifa; + if ((ifa = inet_alloc_ifa()) != NULL) { + ifa->ifa_local = +@@ -1161,10 +1167,10 @@ static struct rtnetlink_link inet_rtnetl + void inet_forward_change(void) + { + struct net_device *dev; +- int on = ipv4_devconf.forwarding; ++ int on = ve_ipv4_devconf.forwarding; + +- ipv4_devconf.accept_redirects = !on; +- ipv4_devconf_dflt.forwarding = on; ++ ve_ipv4_devconf.accept_redirects = !on; ++ ve_ipv4_devconf_dflt.forwarding = on; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { +@@ -1189,9 +1195,9 @@ static int devinet_sysctl_forward(ctl_ta + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && *valp != val) { +- if (valp == &ipv4_devconf.forwarding) ++ if (valp == &ve_ipv4_devconf.forwarding) + inet_forward_change(); +- else if (valp != &ipv4_devconf_dflt.forwarding) ++ else if (valp != &ve_ipv4_devconf_dflt.forwarding) + rt_cache_flush(0); + } + +@@ -1462,30 +1468,22 @@ static struct devinet_sysctl_table { + }, + }; + +-static void devinet_sysctl_register(struct in_device *in_dev, +- struct ipv4_devconf *p) ++static struct devinet_sysctl_table *__devinet_sysctl_register(char *dev_name, ++ int ifindex, struct ipv4_devconf *p) + { + int i; +- struct net_device *dev = in_dev ? 
in_dev->dev : NULL; +- struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); +- char *dev_name = NULL; ++ struct devinet_sysctl_table *t; + ++ t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) +- return; ++ goto out; ++ + memcpy(t, &devinet_sysctl, sizeof(*t)); + for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { + t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; + t->devinet_vars[i].de = NULL; + } + +- if (dev) { +- dev_name = dev->name; +- t->devinet_dev[0].ctl_name = dev->ifindex; +- } else { +- dev_name = "default"; +- t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; +- } +- + /* + * Make a copy of dev_name, because '.procname' is regarded as const + * by sysctl and we wouldn't want anyone to change it under our feet +@@ -1493,8 +1491,9 @@ static void devinet_sysctl_register(stru + */ + dev_name = kstrdup(dev_name, GFP_KERNEL); + if (!dev_name) +- goto free; ++ goto out_free_table; + ++ t->devinet_dev[0].ctl_name = ifindex; + t->devinet_dev[0].procname = dev_name; + t->devinet_dev[0].child = t->devinet_vars; + t->devinet_dev[0].de = NULL; +@@ -1507,17 +1506,38 @@ static void devinet_sysctl_register(stru + + t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); + if (!t->sysctl_header) +- goto free_procname; ++ goto out_free_procname; + +- p->sysctl = t; +- return; ++ return t; + + /* error path */ +- free_procname: ++out_free_procname: + kfree(dev_name); +- free: ++out_free_table: + kfree(t); +- return; ++out: ++ printk(KERN_DEBUG "Can't register net/ipv4/conf sysctls.\n"); ++ return NULL; ++} ++ ++static void devinet_sysctl_register(struct in_device *in_dev, ++ struct ipv4_devconf *p) ++{ ++ struct net_device *dev; ++ char *dev_name; ++ int ifindex; ++ ++ dev = in_dev ? in_dev->dev : NULL; ++ ++ if (dev) { ++ dev_name = dev->name; ++ ifindex = dev->ifindex; ++ } else { ++ dev_name = "default"; ++ ifindex = NET_PROTO_CONF_DEFAULT; ++ } ++ ++ p->sysctl = __devinet_sysctl_register(dev_name, ifindex, p); + } + + static void devinet_sysctl_unregister(struct ipv4_devconf *p) +@@ -1530,7 +1550,170 @@ static void devinet_sysctl_unregister(st + kfree(t); + } + } ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++static ctl_table net_sysctl_tables[] = { ++ /* 0: net */ ++ { ++ .ctl_name = CTL_NET, ++ .procname = "net", ++ .mode = 0555, ++ .child = &net_sysctl_tables[2], ++ }, ++ { .ctl_name = 0, }, ++ /* 2: net/ipv4 */ ++ { ++ .ctl_name = NET_IPV4, ++ .procname = "ipv4", ++ .mode = 0555, ++ .child = &net_sysctl_tables[4], ++ }, ++ { .ctl_name = 0, }, ++ /* 4, 5: net/ipv4/[vars] */ ++ { ++ .ctl_name = NET_IPV4_FORWARD, ++ .procname = "ip_forward", ++ .data = &ipv4_devconf.forwarding, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &ipv4_sysctl_forward, ++ .strategy = &ipv4_sysctl_forward_strategy, ++ }, ++ { ++ .ctl_name = NET_IPV4_ROUTE, ++ .procname = "route", ++ .maxlen = 0, ++ .mode = 0555, ++ .child = &net_sysctl_tables[7], ++ }, ++ { .ctl_name = 0 }, ++ /* 7: net/ipv4/route/flush */ ++ { ++ .ctl_name = NET_IPV4_ROUTE_FLUSH, ++ .procname = "flush", ++ .data = NULL, /* setuped below */ ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &ipv4_sysctl_rtcache_flush, ++ .strategy = &ipv4_sysctl_rtcache_flush_strategy, ++ }, ++ { .ctl_name = 0 }, ++}; ++ ++static int ip_forward_sysctl_register(struct ve_struct *ve, ++ struct ipv4_devconf *p) ++{ ++ struct ctl_table_header *hdr; ++ ctl_table *root; ++ ++ root = clone_sysctl_template(net_sysctl_tables, ++ sizeof(net_sysctl_tables) / sizeof(ctl_table)); ++ if 
(root == NULL) ++ goto out; ++ ++ root[4].data = &p->forwarding; ++ root[7].data = &ipv4_flush_delay; ++ ++ hdr = register_sysctl_table(root, 1); ++ if (hdr == NULL) ++ goto out_free; ++ ++ ve->forward_header = hdr; ++ ve->forward_table = root; ++ return 0; ++ ++out_free: ++ free_sysctl_clone(root); ++out: ++ return -ENOMEM; ++} ++ ++static inline void ip_forward_sysctl_unregister(struct ve_struct *ve) ++{ ++ unregister_sysctl_table(ve->forward_header); ++ ve->forward_header = NULL; ++} ++ ++static inline void ip_forward_sysctl_free(struct ve_struct *ve) ++{ ++ free_sysctl_clone(ve->forward_table); ++ ve->forward_table = NULL; ++} ++#endif ++#endif ++ ++int devinet_sysctl_init(struct ve_struct *ve) ++{ ++ int err = 0; ++#ifdef CONFIG_SYSCTL ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ struct ipv4_devconf *conf, *conf_def; ++ ++ err = -ENOMEM; ++ ++ conf = kmalloc(sizeof(*conf), GFP_KERNEL); ++ if (!conf) ++ goto err1; ++ ++ memcpy(conf, &ipv4_devconf, sizeof(*conf)); ++ conf->sysctl = __devinet_sysctl_register("all", ++ NET_PROTO_CONF_ALL, conf); ++ if (!conf->sysctl) ++ goto err2; ++ ++ conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL); ++ if (!conf_def) ++ goto err3; ++ ++ memcpy(conf_def, &ipv4_devconf_dflt, sizeof(*conf_def)); ++ conf_def->sysctl = __devinet_sysctl_register("default", ++ NET_PROTO_CONF_DEFAULT, conf_def); ++ if (!conf_def->sysctl) ++ goto err4; ++ ++ err = ip_forward_sysctl_register(ve, conf); ++ if (err) ++ goto err5; ++ ++ ve->_ipv4_devconf = conf; ++ ve->_ipv4_devconf_dflt = conf_def; ++ return 0; ++ ++err5: ++ devinet_sysctl_unregister(conf_def); ++err4: ++ kfree(conf_def); ++err3: ++ devinet_sysctl_unregister(conf); ++err2: ++ kfree(conf); ++err1: + #endif ++#endif ++ return err; ++} ++ ++void devinet_sysctl_fini(struct ve_struct *ve) ++{ ++#ifdef CONFIG_SYSCTL ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ ip_forward_sysctl_unregister(ve); ++ devinet_sysctl_unregister(ve->_ipv4_devconf); ++ devinet_sysctl_unregister(ve->_ipv4_devconf_dflt); ++#endif ++#endif ++} ++ ++void devinet_sysctl_free(struct ve_struct *ve) ++{ ++#ifdef CONFIG_SYSCTL ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ ip_forward_sysctl_free(ve); ++ kfree(ve->_ipv4_devconf); ++ kfree(ve->_ipv4_devconf_dflt); ++#endif ++#endif ++} + + void __init devinet_init(void) + { +@@ -1540,13 +1723,18 @@ void __init devinet_init(void) + #ifdef CONFIG_SYSCTL + devinet_sysctl.sysctl_header = + register_sysctl_table(devinet_sysctl.devinet_root_dir, 0); +- devinet_sysctl_register(NULL, &ipv4_devconf_dflt); ++ __devinet_sysctl_register("default", NET_PROTO_CONF_DEFAULT, ++ &ipv4_devconf_dflt); + #endif + } + + EXPORT_SYMBOL(devinet_ioctl); + EXPORT_SYMBOL(in_dev_finish_destroy); + EXPORT_SYMBOL(inet_select_addr); ++EXPORT_SYMBOL(inet_del_ifa); + EXPORT_SYMBOL(inetdev_by_index); ++EXPORT_SYMBOL(devinet_sysctl_init); ++EXPORT_SYMBOL(devinet_sysctl_fini); ++EXPORT_SYMBOL(devinet_sysctl_free); + EXPORT_SYMBOL(register_inetaddr_notifier); + EXPORT_SYMBOL(unregister_inetaddr_notifier); +diff -uprN linux-2.6.15.orig/net/ipv4/fib_frontend.c linux-2.6.15-ve025stab014/net/ipv4/fib_frontend.c +--- linux-2.6.15.orig/net/ipv4/fib_frontend.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/fib_frontend.c 2006-01-27 14:48:08.000000000 +0300 +@@ -51,14 +51,46 @@ + + #define RT_TABLE_MIN RT_TABLE_MAIN + ++#undef ip_fib_local_table ++#undef ip_fib_main_table + struct fib_table *ip_fib_local_table; + struct fib_table 
*ip_fib_main_table; ++void prepare_fib_tables(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->_local_table = ip_fib_local_table; ++ ip_fib_local_table = (struct fib_table *)0x12345678; ++ get_ve0()->_main_table = ip_fib_main_table; ++ ip_fib_main_table = (struct fib_table *)0x12345678; ++#endif ++} ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ip_fib_local_table get_exec_env()->_local_table ++#define ip_fib_main_table get_exec_env()->_main_table ++#endif + + #else + + #define RT_TABLE_MIN 1 + ++#undef fib_tables + struct fib_table *fib_tables[RT_TABLE_MAX+1]; ++void prepare_fib_tables(void) ++{ ++#ifdef CONFIG_VE ++ int i; ++ ++ BUG_ON(sizeof(fib_tables) != ++ sizeof(((struct ve_struct *)0)->_fib_tables)); ++ memcpy(get_ve0()->_fib_tables, fib_tables, sizeof(fib_tables)); ++ for (i = 0; i <= RT_TABLE_MAX; i++) ++ fib_tables[i] = (void *)0x12366678; ++#endif ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define fib_tables get_exec_env()->_fib_tables ++#endif + + struct fib_table *__fib_new_table(int id) + { +@@ -248,7 +280,7 @@ int ip_rt_ioctl(unsigned int cmd, void _ + switch (cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&r, arg, sizeof(struct rtentry))) + return -EFAULT; +@@ -651,6 +683,7 @@ static struct notifier_block fib_netdev_ + + void __init ip_fib_init(void) + { ++ prepare_fib_tables(); + #ifndef CONFIG_IP_MULTIPLE_TABLES + ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); + ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); +diff -uprN linux-2.6.15.orig/net/ipv4/fib_hash.c linux-2.6.15-ve025stab014/net/ipv4/fib_hash.c +--- linux-2.6.15.orig/net/ipv4/fib_hash.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/fib_hash.c 2006-01-27 14:48:08.000000000 +0300 +@@ -35,6 +35,7 @@ + #include <linux/skbuff.h> + #include <linux/netlink.h> + #include <linux/init.h> ++#include <linux/ve.h> + + #include <net/ip.h> + #include <net/protocol.h> +@@ -72,11 +73,6 @@ struct fn_zone { + * can be cheaper than memory lookup, so that FZ_* macros are used. 
+ */ + +-struct fn_hash { +- struct fn_zone *fn_zones[33]; +- struct fn_zone *fn_zone_list; +-}; +- + static inline u32 fn_hash(u32 key, struct fn_zone *fz) + { + u32 h = ntohl(key)>>(32 - fz->fz_order); +@@ -622,7 +618,7 @@ fn_hash_delete(struct fib_table *tb, str + return -ESRCH; + } + +-static int fn_flush_list(struct fn_zone *fz, int idx) ++static int fn_flush_list(struct fn_zone *fz, int idx, int destroy) + { + struct hlist_head *head = &fz->fz_hash[idx]; + struct hlist_node *node, *n; +@@ -637,7 +633,9 @@ static int fn_flush_list(struct fn_zone + list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { + struct fib_info *fi = fa->fa_info; + +- if (fi && (fi->fib_flags&RTNH_F_DEAD)) { ++ if (fi == NULL) ++ continue; ++ if (destroy || (fi->fib_flags&RTNH_F_DEAD)) { + write_lock_bh(&fib_hash_lock); + list_del(&fa->fa_list); + if (list_empty(&f->fn_alias)) { +@@ -659,7 +657,7 @@ static int fn_flush_list(struct fn_zone + return found; + } + +-static int fn_hash_flush(struct fib_table *tb) ++static int __fn_hash_flush(struct fib_table *tb, int destroy) + { + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz; +@@ -669,11 +667,84 @@ static int fn_hash_flush(struct fib_tabl + int i; + + for (i = fz->fz_divisor - 1; i >= 0; i--) +- found += fn_flush_list(fz, i); ++ found += fn_flush_list(fz, i, destroy); + } + return found; + } + ++static int fn_hash_flush(struct fib_table *tb) ++{ ++ return __fn_hash_flush(tb, 0); ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++void fib_hash_destroy(struct fib_table *tb) ++{ ++ __fn_hash_flush(tb, 1); ++ kfree(tb); ++} ++ ++/* ++ * Initialization of virtualized networking subsystem. ++ */ ++int init_ve_route(struct ve_struct *ve) ++{ ++#ifdef CONFIG_IP_MULTIPLE_TABLES ++ if (fib_rules_create()) ++ return -ENOMEM; ++ ve->_fib_tables[RT_TABLE_LOCAL] = fib_hash_init(RT_TABLE_LOCAL); ++ if (!ve->_fib_tables[RT_TABLE_LOCAL]) ++ goto out_destroy; ++ ve->_fib_tables[RT_TABLE_MAIN] = fib_hash_init(RT_TABLE_MAIN); ++ if (!ve->_fib_tables[RT_TABLE_MAIN]) ++ goto out_destroy_local; ++ ++ return 0; ++ ++out_destroy_local: ++ fib_hash_destroy(ve->_fib_tables[RT_TABLE_LOCAL]); ++out_destroy: ++ fib_rules_destroy(); ++ ve->_local_rule = NULL; ++ return -ENOMEM; ++#else ++ ve->_local_table = fib_hash_init(RT_TABLE_LOCAL); ++ if (!ve->_local_table) ++ return -ENOMEM; ++ ve->_main_table = fib_hash_init(RT_TABLE_MAIN); ++ if (!ve->_main_table) { ++ fib_hash_destroy(ve->_local_table); ++ return -ENOMEM; ++ } ++ return 0; ++#endif ++} ++ ++void fini_ve_route(struct ve_struct *ve) ++{ ++#ifdef CONFIG_IP_MULTIPLE_TABLES ++ int i; ++ for (i=0; i<RT_TABLE_MAX+1; i++) ++ { ++ if (!ve->_fib_tables[i]) ++ continue; ++ fib_hash_destroy(ve->_fib_tables[i]); ++ } ++ fib_rules_destroy(); ++ ve->_local_rule = NULL; ++#else ++ fib_hash_destroy(ve->_local_table); ++ fib_hash_destroy(ve->_main_table); ++#endif ++ fib_hash_free(ve->_fib_info_hash, ve->_fib_hash_size); ++ fib_hash_free(ve->_fib_info_laddrhash, ve->_fib_hash_size); ++ ve->_fib_info_hash = ve->_fib_info_laddrhash = NULL; ++} ++ ++EXPORT_SYMBOL(init_ve_route); ++EXPORT_SYMBOL(fini_ve_route); ++#endif ++ + + static inline int + fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, +@@ -765,7 +836,7 @@ static int fn_hash_dump(struct fib_table + return skb->len; + } + +-#ifdef CONFIG_IP_MULTIPLE_TABLES ++#if defined(CONFIG_IP_MULTIPLE_TABLES) || defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) + struct fib_table * fib_hash_init(int id) + #else + 
struct fib_table * __init fib_hash_init(int id) +@@ -1075,13 +1146,13 @@ static struct file_operations fib_seq_fo + + int __init fib_proc_init(void) + { +- if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops)) ++ if (!proc_glob_fops_create("net/route", S_IRUGO, &fib_seq_fops)) + return -ENOMEM; + return 0; + } + + void __init fib_proc_exit(void) + { +- proc_net_remove("route"); ++ remove_proc_glob_entry("net/route", NULL); + } + #endif /* CONFIG_PROC_FS */ +diff -uprN linux-2.6.15.orig/net/ipv4/fib_lookup.h linux-2.6.15-ve025stab014/net/ipv4/fib_lookup.h +--- linux-2.6.15.orig/net/ipv4/fib_lookup.h 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/fib_lookup.h 2006-01-27 14:48:08.000000000 +0300 +@@ -41,5 +41,6 @@ extern struct fib_alias *fib_find_alias( + extern int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, + int *last_idx, int *dflt); ++void fib_hash_free(struct hlist_head *hash, int bytes); + + #endif /* _FIB_LOOKUP_H */ +diff -uprN linux-2.6.15.orig/net/ipv4/fib_rules.c linux-2.6.15-ve025stab014/net/ipv4/fib_rules.c +--- linux-2.6.15.orig/net/ipv4/fib_rules.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/fib_rules.c 2006-01-27 14:48:08.000000000 +0300 +@@ -38,6 +38,7 @@ + #include <linux/proc_fs.h> + #include <linux/skbuff.h> + #include <linux/netlink.h> ++#include <linux/rtnetlink.h> + #include <linux/init.h> + + #include <net/ip.h> +@@ -98,9 +99,87 @@ static struct fib_rule local_rule = { + .r_action = RTN_UNICAST, + }; + +-static struct fib_rule *fib_rules = &local_rule; + static DEFINE_RWLOCK(fib_rules_lock); + ++void __init prepare_fib_rules(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->_local_rule = &local_rule; ++ get_ve0()->_fib_rules = &local_rule; ++#endif ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define local_rule (*(get_exec_env()->_local_rule)) ++#define fib_rules (get_exec_env()->_fib_rules) ++#else ++static struct fib_rule *fib_rules = &local_rule; ++#endif ++ ++#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE) ++int fib_rules_create() ++{ ++ struct fib_rule *default_rule, *main_rule, *loc_rule; ++ ++ default_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); ++ if (default_rule == NULL) ++ goto out_def; ++ memset(default_rule, 0, sizeof(struct fib_rule)); ++ atomic_set(&default_rule->r_clntref, 1); ++ default_rule->r_preference = 0x7FFF; ++ default_rule->r_table = RT_TABLE_DEFAULT; ++ default_rule->r_action = RTN_UNICAST; ++ ++ main_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); ++ if (main_rule == NULL) ++ goto out_main; ++ memset(main_rule, 0, sizeof(struct fib_rule)); ++ atomic_set(&main_rule->r_clntref, 1); ++ main_rule->r_preference = 0x7FFE; ++ main_rule->r_table = RT_TABLE_MAIN; ++ main_rule->r_action = RTN_UNICAST; ++ main_rule->r_next = default_rule; ++ ++ loc_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); ++ if (loc_rule == NULL) ++ goto out_loc; ++ memset(loc_rule, 0, sizeof(struct fib_rule)); ++ atomic_set(&loc_rule->r_clntref, 1); ++ loc_rule->r_preference = 0; ++ loc_rule->r_table = RT_TABLE_LOCAL; ++ loc_rule->r_action = RTN_UNICAST; ++ loc_rule->r_next = main_rule; ++ ++ get_exec_env()->_local_rule = loc_rule; ++ get_exec_env()->_fib_rules = loc_rule; ++ ++ return 0; ++ ++out_loc: ++ kfree(main_rule); ++out_main: ++ kfree(default_rule); ++out_def: ++ return -1; ++} ++ ++void fib_rules_destroy() ++{ ++ struct fib_rule *r; ++ ++ rtnl_lock(); ++ write_lock_bh(&fib_rules_lock); ++ while(fib_rules 
!= NULL) { ++ r = fib_rules; ++ fib_rules = fib_rules->r_next; ++ r->r_dead = 1; ++ fib_rule_put(r); ++ } ++ write_unlock_bh(&fib_rules_lock); ++ rtnl_unlock(); ++} ++#endif ++ + int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) + { + struct rtattr **rta = arg; +@@ -434,5 +513,6 @@ int inet_dump_rules(struct sk_buff *skb, + + void __init fib_rules_init(void) + { ++ prepare_fib_rules(); + register_netdevice_notifier(&fib_rules_notifier); + } +diff -uprN linux-2.6.15.orig/net/ipv4/fib_semantics.c linux-2.6.15-ve025stab014/net/ipv4/fib_semantics.c +--- linux-2.6.15.orig/net/ipv4/fib_semantics.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/fib_semantics.c 2006-01-27 14:48:08.000000000 +0300 +@@ -32,6 +32,7 @@ + #include <linux/netdevice.h> + #include <linux/if_arp.h> + #include <linux/proc_fs.h> ++#include <linux/ve.h> + #include <linux/skbuff.h> + #include <linux/netlink.h> + #include <linux/init.h> +@@ -54,6 +55,24 @@ static struct hlist_head *fib_info_laddr + static unsigned int fib_hash_size; + static unsigned int fib_info_cnt; + ++void prepare_fib_info(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->_fib_info_hash = fib_info_hash; ++ get_ve0()->_fib_info_laddrhash = fib_info_laddrhash; ++ get_ve0()->_fib_hash_size = fib_hash_size; ++ get_ve0()->_fib_info_cnt = fib_info_cnt; ++#endif ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define fib_info_hash (get_exec_env()->_fib_info_hash) ++#define fib_info_laddrhash (get_exec_env()->_fib_info_laddrhash) ++#define fib_hash_size (get_exec_env()->_fib_hash_size) ++#define fib_info_cnt (get_exec_env()->_fib_info_cnt) ++#endif ++ ++ + #define DEVINDEX_HASHBITS 8 + #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) + static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; +@@ -233,13 +252,15 @@ static struct fib_info *fib_find_info(co + return NULL; + } + +-static inline unsigned int fib_devindex_hashfn(unsigned int val) ++static inline unsigned int fib_devindex_hashfn(unsigned int val, ++ envid_t veid) + { + unsigned int mask = DEVINDEX_HASHSIZE - 1; + + return (val ^ + (val >> DEVINDEX_HASHBITS) ^ +- (val >> (DEVINDEX_HASHBITS * 2))) & mask; ++ (val >> (DEVINDEX_HASHBITS * 2)) ^ ++ (veid ^ (veid >> 16))) & mask; + } + + /* Check, that the gateway is already configured. 
+@@ -255,7 +276,7 @@ int ip_fib_check_default(u32 gw, struct + + read_lock(&fib_info_lock); + +- hash = fib_devindex_hashfn(dev->ifindex); ++ hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); + head = &fib_info_devhash[hash]; + hlist_for_each_entry(nh, node, head, nh_hash) { + if (nh->nh_dev == dev && +@@ -578,7 +599,7 @@ static struct hlist_head *fib_hash_alloc + __get_free_pages(GFP_KERNEL, get_order(bytes)); + } + +-static void fib_hash_free(struct hlist_head *hash, int bytes) ++void fib_hash_free(struct hlist_head *hash, int bytes) + { + if (!hash) + return; +@@ -835,7 +856,8 @@ link_it: + + if (!nh->nh_dev) + continue; +- hash = fib_devindex_hashfn(nh->nh_dev->ifindex); ++ hash = fib_devindex_hashfn(nh->nh_dev->ifindex, ++ VEID(nh->nh_dev->owner_env)); + head = &fib_info_devhash[hash]; + hlist_add_head(&nh->nh_hash, head); + } endfor_nexthops(fi) +@@ -1182,7 +1204,8 @@ int fib_sync_down(u32 local, struct net_ + + if (dev) { + struct fib_info *prev_fi = NULL; +- unsigned int hash = fib_devindex_hashfn(dev->ifindex); ++ unsigned int hash = fib_devindex_hashfn(dev->ifindex, ++ VEID(dev->owner_env)); + struct hlist_head *head = &fib_info_devhash[hash]; + struct hlist_node *node; + struct fib_nh *nh; +@@ -1247,7 +1270,7 @@ int fib_sync_up(struct net_device *dev) + return 0; + + prev_fi = NULL; +- hash = fib_devindex_hashfn(dev->ifindex); ++ hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); + head = &fib_info_devhash[hash]; + ret = 0; + +diff -uprN linux-2.6.15.orig/net/ipv4/igmp.c linux-2.6.15-ve025stab014/net/ipv4/igmp.c +--- linux-2.6.15.orig/net/ipv4/igmp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/igmp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -2328,7 +2328,8 @@ static inline struct ip_sf_list *igmp_mc + struct ip_mc_list *im = NULL; + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + +- for (state->dev = dev_base, state->idev = NULL, state->im = NULL; ++ for (state->dev = dev_base, ++ state->idev = NULL, state->im = NULL; + state->dev; + state->dev = state->dev->next) { + struct in_device *idev; +diff -uprN linux-2.6.15.orig/net/ipv4/inet_connection_sock.c linux-2.6.15-ve025stab014/net/ipv4/inet_connection_sock.c +--- linux-2.6.15.orig/net/ipv4/inet_connection_sock.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/inet_connection_sock.c 2006-01-27 14:48:08.000000000 +0300 +@@ -25,6 +25,9 @@ + #include <net/tcp_states.h> + #include <net/xfrm.h> + ++#include <ub/ub_net.h> ++#include <ub/ub_orphan.h> ++ + #ifdef INET_CSK_DEBUG + const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; + EXPORT_SYMBOL(inet_csk_timer_bug_msg); +@@ -47,6 +50,7 @@ static inline int inet_csk_bind_conflict + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + !inet_v6_ipv6only(sk2) && ++ !ve_accessible_strict(VE_OWNER_SK(sk), VE_OWNER_SK(sk2)) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { +@@ -72,7 +76,9 @@ int inet_csk_get_port(struct inet_hashin + struct hlist_node *node; + struct inet_bind_bucket *tb; + int ret; ++ struct ve_struct *env; + ++ env = VE_OWNER_SK(sk); + local_bh_disable(); + if (!snum) { + int low = sysctl_local_port_range[0]; +@@ -81,11 +87,15 @@ int inet_csk_get_port(struct inet_hashin + int rover = net_random() % (high - low) + low; + + do { +- head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; ++ head = &hashinfo->bhash[inet_bhashfn(rover, ++ hashinfo->bhash_size, VEID(env))]; 
+ spin_lock(&head->lock); +- inet_bind_bucket_for_each(tb, node, &head->chain) ++ inet_bind_bucket_for_each(tb, node, &head->chain) { ++ if (!ve_accessible_strict(VE_OWNER_TB(tb),env)) ++ continue; + if (tb->port == rover) + goto next; ++ } + break; + next: + spin_unlock(&head->lock); +@@ -108,11 +118,15 @@ int inet_csk_get_port(struct inet_hashin + */ + snum = rover; + } else { +- head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; ++ head = &hashinfo->bhash[inet_bhashfn(snum, ++ hashinfo->bhash_size, VEID(env))]; + spin_lock(&head->lock); +- inet_bind_bucket_for_each(tb, node, &head->chain) ++ inet_bind_bucket_for_each(tb, node, &head->chain) { ++ if (!ve_accessible_strict(VE_OWNER_TB(tb), env)) ++ continue; + if (tb->port == snum) + goto tb_found; ++ } + } + tb = NULL; + goto tb_not_found; +@@ -131,7 +145,7 @@ tb_found: + } + tb_not_found: + ret = 1; +- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) ++ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum, env)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) +@@ -536,7 +550,7 @@ void inet_csk_destroy_sock(struct sock * + + sk_refcnt_debug_release(sk); + +- atomic_dec(sk->sk_prot->orphan_count); ++ ub_dec_orphan_count(sk); + sock_put(sk); + } + +@@ -616,7 +630,7 @@ void inet_csk_listen_stop(struct sock *s + + sock_orphan(child); + +- atomic_inc(sk->sk_prot->orphan_count); ++ ub_inc_orphan_count(sk); + + inet_csk_destroy_sock(child); + +diff -uprN linux-2.6.15.orig/net/ipv4/inet_diag.c linux-2.6.15-ve025stab014/net/ipv4/inet_diag.c +--- linux-2.6.15.orig/net/ipv4/inet_diag.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/inet_diag.c 2006-01-27 14:48:08.000000000 +0300 +@@ -595,7 +595,9 @@ static int inet_diag_dump(struct sk_buff + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + const struct inet_diag_handler *handler; + struct inet_hashinfo *hashinfo; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + handler = inet_diag_table[cb->nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; +@@ -616,6 +618,8 @@ static int inet_diag_dump(struct sk_buff + sk_for_each(sk, node, &hashinfo->listening_hash[i]) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (num < s_num) { + num++; + continue; +@@ -677,6 +681,8 @@ skip_listen_ht: + sk_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (num < s_num) + goto next_normal; + if (!(r->idiag_states & (1 << sk->sk_state))) +@@ -699,6 +705,8 @@ next_normal: + &hashinfo->ehash[i + hashinfo->ehash_size].chain) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!ve_accessible_veid(inet_twsk(sk)->tw_owner_env, VEID(ve))) ++ continue; + if (num < s_num) + goto next_dying; + if (r->id.idiag_sport != inet->sport && +diff -uprN linux-2.6.15.orig/net/ipv4/inet_hashtables.c linux-2.6.15-ve025stab014/net/ipv4/inet_hashtables.c +--- linux-2.6.15.orig/net/ipv4/inet_hashtables.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/inet_hashtables.c 2006-01-27 14:48:08.000000000 +0300 +@@ -28,7 +28,8 @@ + */ + struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, +- const unsigned short snum) ++ const unsigned short snum, ++ struct ve_struct *ve) + { + struct inet_bind_bucket *tb = 
kmem_cache_alloc(cachep, SLAB_ATOMIC); + +@@ -36,6 +37,7 @@ struct inet_bind_bucket *inet_bind_bucke + tb->port = snum; + tb->fastreuse = 0; + INIT_HLIST_HEAD(&tb->owners); ++ SET_VE_OWNER_TB(tb, ve); + hlist_add_head(&tb->node, &head->chain); + } + return tb; +@@ -69,10 +71,13 @@ EXPORT_SYMBOL(inet_bind_hash); + */ + static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) + { +- const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); +- struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; ++ int bhash; ++ struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + ++ bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size, ++ VEID(VE_OWNER_SK(sk))); ++ head = &hashinfo->bhash[bhash]; + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + __sk_del_bind_node(sk); +@@ -128,7 +133,8 @@ EXPORT_SYMBOL(inet_listen_wlock); + * wildcarded during the search since they can never be otherwise. + */ + struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, +- const unsigned short hnum, const int dif) ++ const unsigned short hnum, const int dif, ++ struct ve_struct *env) + { + struct sock *result = NULL, *sk; + const struct hlist_node *node; +@@ -137,6 +143,8 @@ struct sock *__inet_lookup_listener(cons + sk_for_each(sk, node, head) { + const struct inet_sock *inet = inet_sk(sk); + ++ if (!ve_accessible_strict(VE_OWNER_SK(sk), env)) ++ continue; + if (inet->num == hnum && !ipv6_only_sock(sk)) { + const __u32 rcv_saddr = inet->rcv_saddr; + int score = sk->sk_family == PF_INET ? 1 : 0; +diff -uprN linux-2.6.15.orig/net/ipv4/inet_timewait_sock.c linux-2.6.15-ve025stab014/net/ipv4/inet_timewait_sock.c +--- linux-2.6.15.orig/net/ipv4/inet_timewait_sock.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/inet_timewait_sock.c 2006-01-27 14:48:08.000000000 +0300 +@@ -32,7 +32,8 @@ void __inet_twsk_kill(struct inet_timewa + write_unlock(&ehead->lock); + + /* Disassociate with bind bucket. */ +- bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; ++ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, ++ hashinfo->bhash_size, tw->tw_owner_env)]; + spin_lock(&bhead->lock); + tb = tw->tw_tb; + __hlist_del(&tw->tw_bind_node); +@@ -66,7 +67,8 @@ void __inet_twsk_hashdance(struct inet_t + Note, that any socket with inet->num != 0 MUST be bound in + binding cache, even if it is closed. 
+ */ +- bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; ++ bhead = &hashinfo->bhash[inet_bhashfn(inet->num, ++ hashinfo->bhash_size, tw->tw_owner_env)]; + spin_lock(&bhead->lock); + tw->tw_tb = icsk->icsk_bind_hash; + BUG_TRAP(icsk->icsk_bind_hash); +@@ -90,8 +92,13 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance) + + struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) + { +- struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, +- SLAB_ATOMIC); ++ struct user_beancounter *ub; ++ struct inet_timewait_sock *tw; ++ ++ ub = set_exec_ub(sock_bc(sk)->ub); ++ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, SLAB_ATOMIC); ++ (void)set_exec_ub(ub); ++ + if (tw != NULL) { + const struct inet_sock *inet = inet_sk(sk); + +diff -uprN linux-2.6.15.orig/net/ipv4/ip_forward.c linux-2.6.15-ve025stab014/net/ipv4/ip_forward.c +--- linux-2.6.15.orig/net/ipv4/ip_forward.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/ip_forward.c 2006-01-27 14:48:08.000000000 +0300 +@@ -87,6 +87,24 @@ int ip_forward(struct sk_buff *skb) + if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto sr_failed; + ++ /* ++ * We try to optimize forwarding of VE packets: ++ * do not decrement TTL (and so save skb_cow) ++ * during forwarding of outgoing pkts from VE. ++ * For incoming pkts we still do ttl decr, ++ * since such skb is not cloned and does not require ++ * actual cow. So, there is at least one place ++ * in pkts path with mandatory ttl decr, that is ++ * sufficient to prevent routing loops. ++ */ ++ iph = skb->nh.iph; ++ if ( ++#ifdef CONFIG_IP_ROUTE_NAT ++ (rt->rt_flags & RTCF_NAT) == 0 && /* no NAT mangling expected */ ++#endif /* and */ ++ (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */ ++ goto no_ttl_decr; ++ + /* We are about to mangle packet. Copy it! */ + if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) + goto drop; +@@ -95,6 +113,8 @@ int ip_forward(struct sk_buff *skb) + /* Decrease ttl after skb cow done */ + ip_decrease_ttl(iph); + ++no_ttl_decr: ++ + /* + * We now generate an ICMP HOST REDIRECT giving the route + * we calculated. +diff -uprN linux-2.6.15.orig/net/ipv4/ip_fragment.c linux-2.6.15-ve025stab014/net/ipv4/ip_fragment.c +--- linux-2.6.15.orig/net/ipv4/ip_fragment.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/ip_fragment.c 2006-01-27 14:48:08.000000000 +0300 +@@ -42,6 +42,7 @@ + #include <linux/udp.h> + #include <linux/inet.h> + #include <linux/netfilter_ipv4.h> ++#include <linux/ve_owner.h> + + /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 + * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c +@@ -91,8 +92,12 @@ struct ipq { + struct timer_list timer; /* when will this queue expire? */ + int iif; + struct timeval stamp; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(IPQ, struct ipq, owner_env) ++DCL_VE_OWNER(IPQ, struct ipq, owner_env) ++ + /* Hash table. 
*/ + + #define IPQ_HASHSZ 64 +@@ -176,7 +181,8 @@ static __inline__ void frag_free_queue(s + + static __inline__ struct ipq *frag_alloc_queue(void) + { +- struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); ++ struct ipq *qp = kmalloc(sizeof(struct ipq) + sizeof(void *), ++ GFP_ATOMIC); + + if(!qp) + return NULL; +@@ -269,6 +275,9 @@ static void ip_evictor(void) + static void ip_expire(unsigned long arg) + { + struct ipq *qp = (struct ipq *) arg; ++ struct ve_struct *envid; ++ ++ envid = set_exec_env(VE_OWNER_IPQ(qp)); + + spin_lock(&qp->lock); + +@@ -291,6 +300,8 @@ static void ip_expire(unsigned long arg) + out: + spin_unlock(&qp->lock); + ipq_put(qp, NULL); ++ ++ (void)set_exec_env(envid); + } + + /* Creation primitives. */ +@@ -312,7 +323,8 @@ static struct ipq *ip_frag_intern(unsign + qp->saddr == qp_in->saddr && + qp->daddr == qp_in->daddr && + qp->protocol == qp_in->protocol && +- qp->user == qp_in->user) { ++ qp->user == qp_in->user && ++ qp->owner_env == get_exec_env()) { + atomic_inc(&qp->refcnt); + write_unlock(&ipfrag_lock); + qp_in->last_in |= COMPLETE; +@@ -361,6 +373,8 @@ static struct ipq *ip_frag_create(unsign + spin_lock_init(&qp->lock); + atomic_set(&qp->refcnt, 1); + ++ SET_VE_OWNER_IPQ(qp, get_exec_env()); ++ + return ip_frag_intern(hash, qp); + + out_nomem: +@@ -387,7 +401,8 @@ static inline struct ipq *ip_find(struct + qp->saddr == saddr && + qp->daddr == daddr && + qp->protocol == protocol && +- qp->user == user) { ++ qp->user == user && ++ qp->owner_env == get_exec_env()) { + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + return qp; +@@ -653,6 +668,9 @@ struct sk_buff *ip_defrag(struct sk_buff + qp->meat == qp->len) + ret = ip_frag_reasm(qp, dev); + ++ if (ret) ++ SET_VE_OWNER_SKB(ret, VE_OWNER_SKB(skb)); ++ + spin_unlock(&qp->lock); + ipq_put(qp, NULL); + return ret; +@@ -663,6 +681,51 @@ struct sk_buff *ip_defrag(struct sk_buff + return NULL; + } + ++#ifdef CONFIG_VE ++/* XXX */ ++void ip_fragment_cleanup(struct ve_struct *envid) ++{ ++ int i, progress; ++ ++ /* All operations with fragment queues are performed from NET_RX/TX ++ * soft interrupts or from timer context. 
--Den */ ++ local_bh_disable(); ++ do { ++ progress = 0; ++ for (i = 0; i < IPQ_HASHSZ; i++) { ++ struct ipq *qp; ++ struct hlist_node *p, *n; ++ ++ if (hlist_empty(&ipq_hash[i])) ++ continue; ++inner_restart: ++ read_lock(&ipfrag_lock); ++ hlist_for_each_entry_safe(qp, p, n, ++ &ipq_hash[i], list) { ++ if (!ve_accessible_strict( ++ VE_OWNER_IPQ(qp), ++ envid)) ++ continue; ++ atomic_inc(&qp->refcnt); ++ read_unlock(&ipfrag_lock); ++ ++ spin_lock(&qp->lock); ++ if (!(qp->last_in&COMPLETE)) ++ ipq_kill(qp); ++ spin_unlock(&qp->lock); ++ ++ ipq_put(qp, NULL); ++ progress = 1; ++ goto inner_restart; ++ } ++ read_unlock(&ipfrag_lock); ++ } ++ } while(progress); ++ local_bh_enable(); ++} ++EXPORT_SYMBOL(ip_fragment_cleanup); ++#endif ++ + void ipfrag_init(void) + { + ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ +diff -uprN linux-2.6.15.orig/net/ipv4/ipmr.c linux-2.6.15-ve025stab014/net/ipv4/ipmr.c +--- linux-2.6.15.orig/net/ipv4/ipmr.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/ipmr.c 2006-01-27 14:48:08.000000000 +0300 +@@ -834,7 +834,7 @@ static void mrtsock_destruct(struct sock + { + rtnl_lock(); + if (sk == mroute_socket) { +- ipv4_devconf.mc_forwarding--; ++ ve_ipv4_devconf.mc_forwarding--; + + write_lock_bh(&mrt_lock); + mroute_socket=NULL; +@@ -885,7 +885,7 @@ int ip_mroute_setsockopt(struct sock *sk + mroute_socket=sk; + write_unlock_bh(&mrt_lock); + +- ipv4_devconf.mc_forwarding++; ++ ve_ipv4_devconf.mc_forwarding++; + } + rtnl_unlock(); + return ret; +diff -uprN linux-2.6.15.orig/net/ipv4/ipvs/ip_vs_conn.c linux-2.6.15-ve025stab014/net/ipv4/ipvs/ip_vs_conn.c +--- linux-2.6.15.orig/net/ipv4/ipvs/ip_vs_conn.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/ipvs/ip_vs_conn.c 2006-01-27 14:48:06.000000000 +0300 +@@ -896,7 +896,8 @@ int ip_vs_conn_init(void) + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, +- SLAB_HWCACHE_ALIGN, NULL, NULL); ++ SLAB_HWCACHE_ALIGN | SLAB_UBC, ++ NULL, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; +diff -uprN linux-2.6.15.orig/net/ipv4/ipvs/ip_vs_core.c linux-2.6.15-ve025stab014/net/ipv4/ipvs/ip_vs_core.c +--- linux-2.6.15.orig/net/ipv4/ipvs/ip_vs_core.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/ipvs/ip_vs_core.c 2006-01-27 14:48:08.000000000 +0300 +@@ -955,6 +955,10 @@ ip_vs_in(unsigned int hooknum, struct sk + * Big tappo: only PACKET_HOST (neither loopback nor mcasts) + * ... don't know why 1st test DOES NOT include 2nd (?) + */ ++ /* ++ * VZ: the question above is right. ++ * The second test is superfluous. 
++ */ + if (unlikely(skb->pkt_type != PACKET_HOST + || skb->dev == &loopback_dev || skb->sk)) { + IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_core.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_core.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_core.c 2006-01-27 14:48:08.000000000 +0300 +@@ -49,6 +49,7 @@ + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> + #include <linux/netfilter_ipv4/ip_conntrack_core.h> + #include <linux/netfilter_ipv4/listhelp.h> ++#include <ub/ub_mem.h> + + #define IP_CONNTRACK_VERSION "2.4" + +@@ -60,22 +61,39 @@ + + DEFINE_RWLOCK(ip_conntrack_lock); + +-/* ip_conntrack_standalone needs this */ +-atomic_t ip_conntrack_count = ATOMIC_INIT(0); ++#ifdef CONFIG_VE_IPTABLES ++#define ve_ip_conntrack_helpers \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_helpers) ++#define ve_ip_conntrack_max \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_max) ++#define ve_ip_conntrack_count \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_count) ++#else + + void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; + LIST_HEAD(ip_conntrack_expect_list); + struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; + static LIST_HEAD(helpers); ++struct list_head *ip_conntrack_hash; ++#define ve_ip_conntrack_count ip_conntrack_count ++#define ve_ip_conntrack_helpers helpers ++#define ve_ip_conntrack_max ip_conntrack_max ++#define ve_ip_conntrack_count ip_conntrack_count ++#endif ++ ++/* ip_conntrack_standalone needs this */ ++atomic_t ip_conntrack_count = ATOMIC_INIT(0); ++ + unsigned int ip_conntrack_htable_size = 0; + int ip_conntrack_max; +-struct list_head *ip_conntrack_hash; + static kmem_cache_t *ip_conntrack_cachep __read_mostly; + static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; + struct ip_conntrack ip_conntrack_untracked; + unsigned int ip_ct_log_invalid; + static LIST_HEAD(unconfirmed); ++#ifndef CONFIG_VE + static int ip_conntrack_vmalloc; ++#endif + + static unsigned int ip_conntrack_next_id = 1; + static unsigned int ip_conntrack_expect_next_id = 1; +@@ -226,7 +244,7 @@ __ip_conntrack_expect_find(const struct + { + struct ip_conntrack_expect *i; + +- list_for_each_entry(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + atomic_inc(&i->use); + return i; +@@ -255,7 +273,7 @@ find_expectation(const struct ip_conntra + { + struct ip_conntrack_expect *i; + +- list_for_each_entry(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? 
+ Hence these are not the droids you are looking for (if +@@ -284,7 +302,7 @@ void ip_ct_remove_expectations(struct ip + if (ct->expecting == 0) + return; + +- list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { ++ list_for_each_entry_safe(i, tmp, &ve_ip_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + ip_conntrack_expect_put(i); +@@ -302,8 +320,10 @@ clean_from_lists(struct ip_conntrack *ct + + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); +- LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); +- LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); ++ LIST_DELETE(&ve_ip_conntrack_hash[ho], ++ &ct->tuplehash[IP_CT_DIR_ORIGINAL]); ++ LIST_DELETE(&ve_ip_conntrack_hash[hr], ++ &ct->tuplehash[IP_CT_DIR_REPLY]); + + /* Destroy all pending expectations */ + ip_ct_remove_expectations(ct); +@@ -329,8 +349,8 @@ destroy_conntrack(struct nf_conntrack *n + if (proto && proto->destroy) + proto->destroy(ct); + +- if (ip_conntrack_destroyed) +- ip_conntrack_destroyed(ct); ++ if (ve_ip_conntrack_destroyed) ++ ve_ip_conntrack_destroyed(ct); + + write_lock_bh(&ip_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, +@@ -358,7 +378,11 @@ destroy_conntrack(struct nf_conntrack *n + static void death_by_timeout(unsigned long ul_conntrack) + { + struct ip_conntrack *ct = (void *)ul_conntrack; ++#ifdef CONFIG_VE_IPTABLES ++ struct ve_struct *old; + ++ old = set_exec_env(VE_OWNER_CT(ct)); ++#endif + write_lock_bh(&ip_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. */ +@@ -366,6 +390,9 @@ static void death_by_timeout(unsigned lo + clean_from_lists(ct); + write_unlock_bh(&ip_conntrack_lock); + ip_conntrack_put(ct); ++#ifdef CONFIG_VE_IPTABLES ++ (void)set_exec_env(old); ++#endif + } + + static inline int +@@ -386,7 +413,7 @@ __ip_conntrack_find(const struct ip_conn + unsigned int hash = hash_conntrack(tuple); + + ASSERT_READ_LOCK(&ip_conntrack_lock); +- list_for_each_entry(h, &ip_conntrack_hash[hash], list) { ++ list_for_each_entry(h, &ve_ip_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + CONNTRACK_STAT_INC(found); + return h; +@@ -418,9 +445,9 @@ static void __ip_conntrack_hash_insert(s + unsigned int repl_hash) + { + ct->id = ++ip_conntrack_next_id; +- list_prepend(&ip_conntrack_hash[hash], ++ list_prepend(&ve_ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); +- list_prepend(&ip_conntrack_hash[repl_hash], ++ list_prepend(&ve_ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY].list); + } + +@@ -471,11 +498,11 @@ __ip_conntrack_confirm(struct sk_buff ** + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. 
*/ +- if (!LIST_FIND(&ip_conntrack_hash[hash], ++ if (!LIST_FIND(&ve_ip_conntrack_hash[hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) +- && !LIST_FIND(&ip_conntrack_hash[repl_hash], ++ && !LIST_FIND(&ve_ip_conntrack_hash[repl_hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { +@@ -569,7 +596,7 @@ static inline int helper_cmp(const struc + static struct ip_conntrack_helper * + __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) + { +- return LIST_FIND(&helpers, helper_cmp, ++ return LIST_FIND(&ve_ip_conntrack_helpers, helper_cmp, + struct ip_conntrack_helper *, + tuple); + } +@@ -605,7 +632,7 @@ void ip_conntrack_helper_put(struct ip_c + struct ip_conntrack_protocol * + __ip_conntrack_proto_find(u_int8_t protocol) + { +- return ip_ct_protos[protocol]; ++ return ve_ip_ct_protos[protocol]; + } + + /* this is guaranteed to always return a valid protocol helper, since +@@ -632,29 +659,32 @@ void ip_conntrack_proto_put(struct ip_co + } + + struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, +- struct ip_conntrack_tuple *repl) ++ struct ip_conntrack_tuple *repl, struct user_beancounter *ub) + { + struct ip_conntrack *conntrack; ++ struct user_beancounter *old_ub; + + if (!ip_conntrack_hash_rnd_initted) { + get_random_bytes(&ip_conntrack_hash_rnd, 4); + ip_conntrack_hash_rnd_initted = 1; + } + +- if (ip_conntrack_max +- && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { ++ if (ve_ip_conntrack_max ++ && atomic_read(&ve_ip_conntrack_count) >= ve_ip_conntrack_max) { + unsigned int hash = hash_conntrack(orig); + /* Try dropping from this hash chain. */ +- if (!early_drop(&ip_conntrack_hash[hash])) { ++ if (!early_drop(&ve_ip_conntrack_hash[hash])) { + if (net_ratelimit()) +- printk(KERN_WARNING +- "ip_conntrack: table full, dropping" +- " packet.\n"); ++ ve_printk(VE_LOG_BOTH, KERN_WARNING ++ "ip_conntrack: VPS %d: table full, dropping" ++ " packet.\n", VEID(get_exec_env())); + return ERR_PTR(-ENOMEM); + } + } + ++ old_ub = set_exec_ub(ub); + conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); ++ (void)set_exec_ub(old_ub); + if (!conntrack) { + DEBUGP("Can't allocate conntrack.\n"); + return ERR_PTR(-ENOMEM); +@@ -669,8 +699,11 @@ struct ip_conntrack *ip_conntrack_alloc( + init_timer(&conntrack->timeout); + conntrack->timeout.data = (unsigned long)conntrack; + conntrack->timeout.function = death_by_timeout; ++#ifdef CONFIG_VE_IPTABLES ++ SET_VE_OWNER_CT(conntrack, get_exec_env()); ++#endif + +- atomic_inc(&ip_conntrack_count); ++ atomic_inc(&ve_ip_conntrack_count); + + return conntrack; + } +@@ -678,7 +711,7 @@ struct ip_conntrack *ip_conntrack_alloc( + void + ip_conntrack_free(struct ip_conntrack *conntrack) + { +- atomic_dec(&ip_conntrack_count); ++ atomic_dec(&ve_ip_conntrack_count); + kmem_cache_free(ip_conntrack_cachep, conntrack); + } + +@@ -692,13 +725,22 @@ init_conntrack(struct ip_conntrack_tuple + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + struct ip_conntrack_expect *exp; ++ struct user_beancounter *ub; + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + +- conntrack = ip_conntrack_alloc(tuple, &repl_tuple); ++#ifdef CONFIG_USER_RESOURCE ++ if (skb->dev != NULL) /* received skb */ ++ ub = netdev_bc(skb->dev)->exec_ub; ++ else if (skb->sk != NULL) /* sent skb */ ++ ub = sock_bc(skb->sk)->ub; ++ else ++#endif ++ 
ub = NULL; ++ conntrack = ip_conntrack_alloc(tuple, &repl_tuple, ub); + if (conntrack == NULL || IS_ERR(conntrack)) + return (struct ip_conntrack_tuple_hash *)conntrack; + +@@ -925,7 +967,7 @@ void ip_conntrack_unexpect_related(struc + + write_lock_bh(&ip_conntrack_lock); + /* choose the the oldest expectation to evict */ +- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) { + if (expect_matches(i, exp) && del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + write_unlock_bh(&ip_conntrack_lock); +@@ -963,7 +1005,7 @@ static void ip_conntrack_expect_insert(s + { + atomic_inc(&exp->use); + exp->master->expecting++; +- list_add(&exp->list, &ip_conntrack_expect_list); ++ list_add(&exp->list, &ve_ip_conntrack_expect_list); + + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; +@@ -981,7 +1023,7 @@ static void evict_oldest_expect(struct i + { + struct ip_conntrack_expect *i; + +- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) { + if (i->master == master) { + if (del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); +@@ -1012,7 +1054,7 @@ int ip_conntrack_expect_related(struct i + DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); + + write_lock_bh(&ip_conntrack_lock); +- list_for_each_entry(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { +@@ -1060,18 +1102,48 @@ int ip_conntrack_helper_register(struct + { + BUG_ON(me->timeout == 0); + write_lock_bh(&ip_conntrack_lock); +- list_prepend(&helpers, me); ++ list_prepend(&ve_ip_conntrack_helpers, me); + write_unlock_bh(&ip_conntrack_lock); + + return 0; + } + ++int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *me) ++{ ++ int ret; ++ struct module *mod = me->me; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct ip_conntrack_helper *tmp; ++ __module_get(mod); ++ ret = -ENOMEM; ++ tmp = kmalloc(sizeof(struct ip_conntrack_helper), GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, me, sizeof(struct ip_conntrack_helper)); ++ me = tmp; ++ } ++ ++ ret = ip_conntrack_helper_register(me); ++ if (ret) ++ goto out; ++ ++ return 0; ++out: ++ if (!ve_is_super(get_exec_env())){ ++ kfree(me); ++nomem: ++ module_put(mod); ++ } ++ return ret; ++} ++ + struct ip_conntrack_helper * + __ip_conntrack_helper_find_byname(const char *name) + { + struct ip_conntrack_helper *h; + +- list_for_each_entry(h, &helpers, list) { ++ list_for_each_entry(h, &ve_ip_conntrack_helpers, list) { + if (!strcmp(h->name, name)) + return h; + } +@@ -1096,10 +1168,10 @@ void ip_conntrack_helper_unregister(stru + + /* Need write lock here, to delete helper. */ + write_lock_bh(&ip_conntrack_lock); +- LIST_DELETE(&helpers, me); ++ LIST_DELETE(&ve_ip_conntrack_helpers, me); + + /* Get rid of expectations */ +- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { ++ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + ip_ct_unlink_expect(exp); + ip_conntrack_expect_put(exp); +@@ -1108,7 +1180,7 @@ void ip_conntrack_helper_unregister(stru + /* Get rid of expecteds, set helpers to NULL. 
*/ + LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); + for (i = 0; i < ip_conntrack_htable_size; i++) +- LIST_FIND_W(&ip_conntrack_hash[i], unhelp, ++ LIST_FIND_W(&ve_ip_conntrack_hash[i], unhelp, + struct ip_conntrack_tuple_hash *, me); + write_unlock_bh(&ip_conntrack_lock); + +@@ -1116,6 +1188,25 @@ void ip_conntrack_helper_unregister(stru + synchronize_net(); + } + ++void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) ++{ ++ ++ if (!ve_is_super(get_exec_env())) { ++ read_lock_bh(&ip_conntrack_lock); ++ me = list_named_find(&ve_ip_conntrack_helpers, me->name); ++ read_unlock_bh(&ip_conntrack_lock); ++ if (!me) ++ return; ++ } ++ ++ ip_conntrack_helper_unregister(me); ++ ++ if (!ve_is_super(get_exec_env())) { ++ module_put(me->me); ++ kfree(me); ++ } ++} ++ + /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ + void __ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, +@@ -1246,7 +1337,7 @@ get_next_corpse(int (*iter)(struct ip_co + + write_lock_bh(&ip_conntrack_lock); + for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { +- h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, ++ h = LIST_FIND_W(&ve_ip_conntrack_hash[*bucket], do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + break; +@@ -1359,12 +1450,17 @@ static void free_conntrack_hash(struct l + get_order(sizeof(struct list_head) * size)); + } + ++static void ip_conntrack_cache_free(void) ++{ ++ kmem_cache_destroy(ip_conntrack_expect_cachep); ++ kmem_cache_destroy(ip_conntrack_cachep); ++ nf_unregister_sockopt(&so_getorigdst); ++} ++ + /* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ + void ip_conntrack_cleanup(void) + { +- ip_ct_attach = NULL; +- + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... 
*/ +@@ -1373,19 +1469,29 @@ void ip_conntrack_cleanup(void) + ip_ct_event_cache_flush(); + i_see_dead_people: + ip_conntrack_flush(); +- if (atomic_read(&ip_conntrack_count) != 0) { ++ if (atomic_read(&ve_ip_conntrack_count) != 0) { + schedule(); + goto i_see_dead_people; + } +- /* wait until all references to ip_conntrack_untracked are dropped */ +- while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) +- schedule(); +- +- kmem_cache_destroy(ip_conntrack_cachep); +- kmem_cache_destroy(ip_conntrack_expect_cachep); +- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, ++ if (ve_is_super(get_exec_env())) { ++ /* wait until all references to ip_conntrack_untracked are ++ * dropped */ ++ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) ++ schedule(); ++ ip_ct_attach = NULL; ++ ip_conntrack_cache_free(); ++ } ++ free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc, + ip_conntrack_htable_size); +- nf_unregister_sockopt(&so_getorigdst); ++ INIT_LIST_HEAD(&ve_ip_conntrack_expect_list); ++ INIT_LIST_HEAD(&ve_ip_conntrack_helpers); ++ atomic_set(&ve_ip_conntrack_count, 0); ++ ve_ip_conntrack_max = 0; ++#ifdef CONFIG_VE_IPTABLES ++ kfree(ve_ip_ct_protos); ++ ve_ip_ct_protos = NULL; ++ kfree(get_exec_env()->_ip_conntrack); ++#endif + } + + static struct list_head *alloc_hashtable(int size, int *vmalloced) +@@ -1394,13 +1500,13 @@ static struct list_head *alloc_hashtable + unsigned int i; + + *vmalloced = 0; +- hash = (void*)__get_free_pages(GFP_KERNEL, ++ hash = (void*)__get_free_pages(GFP_KERNEL_UBC, + get_order(sizeof(struct list_head) + * size)); + if (!hash) { + *vmalloced = 1; + printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n"); +- hash = vmalloc(sizeof(struct list_head) * size); ++ hash = ub_vmalloc(sizeof(struct list_head) * size); + } + + if (hash) +@@ -1436,8 +1542,8 @@ static int set_hashsize(const char *val, + + write_lock_bh(&ip_conntrack_lock); + for (i = 0; i < ip_conntrack_htable_size; i++) { +- while (!list_empty(&ip_conntrack_hash[i])) { +- h = list_entry(ip_conntrack_hash[i].next, ++ while (!list_empty(&ve_ip_conntrack_hash[i])) { ++ h = list_entry(ve_ip_conntrack_hash[i].next, + struct ip_conntrack_tuple_hash, list); + list_del(&h->list); + bucket = __hash_conntrack(&h->tuple, hashsize, rnd); +@@ -1445,12 +1551,12 @@ static int set_hashsize(const char *val, + } + } + old_size = ip_conntrack_htable_size; +- old_vmalloced = ip_conntrack_vmalloc; +- old_hash = ip_conntrack_hash; ++ old_vmalloced = ve_ip_conntrack_vmalloc; ++ old_hash = ve_ip_conntrack_hash; + + ip_conntrack_htable_size = hashsize; +- ip_conntrack_vmalloc = vmalloced; +- ip_conntrack_hash = hash; ++ ve_ip_conntrack_vmalloc = vmalloced; ++ ve_ip_conntrack_hash = hash; + ip_conntrack_hash_rnd = rnd; + write_unlock_bh(&ip_conntrack_lock); + +@@ -1461,9 +1567,8 @@ static int set_hashsize(const char *val, + module_param_call(hashsize, set_hashsize, param_get_uint, + &ip_conntrack_htable_size, 0600); + +-int __init ip_conntrack_init(void) ++static int ip_conntrack_cache_create(void) + { +- unsigned int i; + int ret; + + /* Idea from tcp.c: use 1/16384 of memory. 
On i386: 32MB +@@ -1477,70 +1582,124 @@ int __init ip_conntrack_init(void) + if (ip_conntrack_htable_size < 16) + ip_conntrack_htable_size = 16; + } +- ip_conntrack_max = 8 * ip_conntrack_htable_size; ++ ve_ip_conntrack_max = 8 * ip_conntrack_htable_size; + + printk("ip_conntrack version %s (%u buckets, %d max)" + " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, +- ip_conntrack_htable_size, ip_conntrack_max, ++ ip_conntrack_htable_size, ve_ip_conntrack_max, + sizeof(struct ip_conntrack)); + + ret = nf_register_sockopt(&so_getorigdst); + if (ret != 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); +- return ret; +- } +- +- ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, +- &ip_conntrack_vmalloc); +- if (!ip_conntrack_hash) { +- printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); +- goto err_unreg_sockopt; ++ goto out_sockopt; + } + ++ ret = -ENOMEM; + ip_conntrack_cachep = kmem_cache_create("ip_conntrack", + sizeof(struct ip_conntrack), 0, +- 0, NULL, NULL); ++ SLAB_UBC, NULL, NULL); + if (!ip_conntrack_cachep) { + printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); +- goto err_free_hash; ++ goto err_unreg_sockopt; + } + + ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", + sizeof(struct ip_conntrack_expect), +- 0, 0, NULL, NULL); ++ 0, SLAB_UBC, NULL, NULL); + if (!ip_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create ip_expect slab cache\n"); + goto err_free_conntrack_slab; + } ++ return 0; ++ ++err_free_conntrack_slab: ++ kmem_cache_destroy(ip_conntrack_cachep); ++err_unreg_sockopt: ++ nf_unregister_sockopt(&so_getorigdst); ++out_sockopt: ++ return ret; ++} ++ ++int ip_conntrack_init(void) ++{ ++ struct ve_struct *env; ++ unsigned int i; ++ int ret; ++ ++ env = get_exec_env(); ++#ifdef CONFIG_VE_IPTABLES ++ ret = -ENOMEM; ++ env->_ip_conntrack = ++ kmalloc(sizeof(struct ve_ip_conntrack), GFP_KERNEL); ++ if (!env->_ip_conntrack) ++ goto out; ++ memset(env->_ip_conntrack, 0, sizeof(struct ve_ip_conntrack)); ++ if (ve_is_super(env)) { ++ ret = ip_conntrack_cache_create(); ++ if (ret) ++ goto cache_fail; ++ } else ++ ve_ip_conntrack_max = 8 * ip_conntrack_htable_size; ++#else /* CONFIG_VE_IPTABLES */ ++ ret = ip_conntrack_cache_create(); ++ if (ret) ++ goto out; ++#endif ++ ++ ret = -ENOMEM; ++ ve_ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, ++ &ve_ip_conntrack_vmalloc); ++ if (!ve_ip_conntrack_hash) { ++ printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); ++ goto err_free_cache; ++ } ++ ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_ct_protos = (struct ip_conntrack_protocol **) ++ ub_kmalloc(sizeof(void *)*MAX_IP_CT_PROTO, GFP_KERNEL); ++ if (!ve_ip_ct_protos) ++ goto err_free_hash; ++#endif + + /* Don't NEED lock here, but good form anyway. */ + write_lock_bh(&ip_conntrack_lock); + for (i = 0; i < MAX_IP_CT_PROTO; i++) +- ip_ct_protos[i] = &ip_conntrack_generic_protocol; ++ ve_ip_ct_protos[i] = &ip_conntrack_generic_protocol; + /* Sew in builtin protocols. 
*/ +- ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; +- ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; +- ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; ++ ve_ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; ++ ve_ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; ++ ve_ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; + write_unlock_bh(&ip_conntrack_lock); + +- /* For use by ipt_REJECT */ +- ip_ct_attach = ip_conntrack_attach; ++ INIT_LIST_HEAD(&ve_ip_conntrack_expect_list); ++ INIT_LIST_HEAD(&ve_ip_conntrack_helpers); + +- /* Set up fake conntrack: +- - to never be deleted, not in any hashes */ +- atomic_set(&ip_conntrack_untracked.ct_general.use, 1); +- /* - and look it like as a confirmed connection */ +- set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); ++ if (ve_is_super(env)) { ++ /* For use by ipt_REJECT */ ++ ip_ct_attach = ip_conntrack_attach; ++ ++ /* Set up fake conntrack: ++ - to never be deleted, not in any hashes */ ++ atomic_set(&ip_conntrack_untracked.ct_general.use, 1); ++ /* - and look it like as a confirmed connection */ ++ set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); ++ } + +- return ret; ++ return 0; + +-err_free_conntrack_slab: +- kmem_cache_destroy(ip_conntrack_cachep); ++#ifdef CONFIG_VE_IPTABLES + err_free_hash: +- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, ++#endif ++ free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc, + ip_conntrack_htable_size); +-err_unreg_sockopt: +- nf_unregister_sockopt(&so_getorigdst); +- +- return -ENOMEM; ++err_free_cache: ++ if (ve_is_super(env)) ++ ip_conntrack_cache_free(); ++#ifdef CONFIG_VE_IPTABLES ++cache_fail: ++ kfree(env->_ip_conntrack); ++#endif ++out: ++ return ret; + } +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_ftp.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_ftp.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_ftp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_ftp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -15,6 +15,7 @@ + #include <linux/ctype.h> + #include <net/checksum.h> + #include <net/tcp.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> + #include <linux/netfilter_ipv4/ip_conntrack_ftp.h> +@@ -28,14 +29,6 @@ MODULE_DESCRIPTION("ftp connection track + static char *ftp_buffer; + static DEFINE_SPINLOCK(ip_ftp_lock); + +-#define MAX_PORTS 8 +-static unsigned short ports[MAX_PORTS]; +-static int ports_c; +-module_param_array(ports, ushort, &ports_c, 0400); +- +-static int loose; +-module_param(loose, int, 0600); +- + unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, +@@ -45,6 +38,14 @@ unsigned int (*ip_nat_ftp_hook)(struct s + u32 *seq); + EXPORT_SYMBOL_GPL(ip_nat_ftp_hook); + ++#define MAX_PORTS 8 ++static unsigned short ports[MAX_PORTS]; ++static int ports_c; ++module_param_array(ports, ushort, &ports_c, 0400); ++ ++static int loose; ++module_param(loose, int, 0600); ++ + #if 0 + #define DEBUGP printk + #else +@@ -425,8 +426,8 @@ static int help(struct sk_buff **pskb, + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ +- if (ip_nat_ftp_hook) +- ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, ++ if (ve_ip_nat_ftp_hook) ++ ret = ve_ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + matchoff, matchlen, exp, &seq); + else { + /* Can't expect this? 
Best to drop packet now. */ +@@ -452,16 +453,39 @@ out_update_nl: + static struct ip_conntrack_helper ftp[MAX_PORTS]; + static char ftp_names[MAX_PORTS][sizeof("ftp-65535")]; + +-/* Not __exit: called from init() */ +-static void fini(void) ++void fini_iptable_ftp(void) + { + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("ip_ct_ftp: unregistering helper for port %d\n", + ports[i]); +- ip_conntrack_helper_unregister(&ftp[i]); ++ virt_ip_conntrack_helper_unregister(&ftp[i]); + } ++} + ++int init_iptable_ftp(void) ++{ ++ int i, ret; ++ ++ for (i = 0; i < ports_c; i++) { ++ DEBUGP("ip_ct_ftp: registering helper for port %d\n", ++ ports[i]); ++ ret = virt_ip_conntrack_helper_register(&ftp[i]); ++ if (ret) { ++ fini_iptable_ftp(); ++ return ret; ++ } ++ } ++ return 0; ++} ++ ++/* Not __exit: called from init() */ ++static void fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_conntrack_ftp); ++ KSYMUNRESOLVE(init_iptable_ftp); ++ KSYMUNRESOLVE(fini_iptable_ftp); ++ fini_iptable_ftp(); + kfree(ftp_buffer); + } + +@@ -496,13 +520,17 @@ static int __init init(void) + + DEBUGP("ip_ct_ftp: registering helper for port %d\n", + ports[i]); +- ret = ip_conntrack_helper_register(&ftp[i]); ++ ret = virt_ip_conntrack_helper_register(&ftp[i]); + + if (ret) { + fini(); + return ret; + } + } ++ ++ KSYMRESOLVE(init_iptable_ftp); ++ KSYMRESOLVE(fini_iptable_ftp); ++ KSYMMODRESOLVE(ip_conntrack_ftp); + return 0; + } + +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_irc.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_irc.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_irc.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_irc.c 2006-01-27 14:48:08.000000000 +0300 +@@ -28,6 +28,7 @@ + #include <linux/ip.h> + #include <net/checksum.h> + #include <net/tcp.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> + #include <linux/netfilter_ipv4/ip_conntrack_irc.h> +@@ -35,7 +36,7 @@ + + #define MAX_PORTS 8 + static unsigned short ports[MAX_PORTS]; +-static int ports_c; ++static int ports_c = 0; + static int max_dcc_channels = 8; + static unsigned int dcc_timeout = 300; + /* This is slow, but it's simple. 
--RR */ +@@ -244,6 +245,33 @@ static char irc_names[MAX_PORTS][sizeof( + + static void fini(void); + ++void fini_iptable_irc(void) ++{ ++ int i; ++ for (i = 0; i < ports_c; i++) { ++ DEBUGP("unregistering port %d\n", ++ ports[i]); ++ virt_ip_conntrack_helper_unregister(&irc_helpers[i]); ++ } ++} ++ ++int init_iptable_irc(void) ++{ ++ int i, ret; ++ ++ for (i = 0; i < ports_c; i++) { ++ DEBUGP("port #%d: %d\n", i, ports[i]); ++ ret = virt_ip_conntrack_helper_register(&irc_helpers[i]); ++ if (ret) { ++ printk("ip_conntrack_irc: ERROR registering port %d\n", ++ ports[i]); ++ fini_iptable_irc(); ++ return -EBUSY; ++ } ++ } ++ return 0; ++} ++ + static int __init init(void) + { + int i, ret; +@@ -287,7 +315,7 @@ static int __init init(void) + + DEBUGP("port #%d: %d\n", i, ports[i]); + +- ret = ip_conntrack_helper_register(hlpr); ++ ret = virt_ip_conntrack_helper_register(hlpr); + + if (ret) { + printk("ip_conntrack_irc: ERROR registering port %d\n", +@@ -296,6 +324,10 @@ static int __init init(void) + return -EBUSY; + } + } ++ ++ KSYMRESOLVE(init_iptable_irc); ++ KSYMRESOLVE(fini_iptable_irc); ++ KSYMMODRESOLVE(ip_conntrack_irc); + return 0; + } + +@@ -303,12 +335,10 @@ static int __init init(void) + * it is needed by the init function */ + static void fini(void) + { +- int i; +- for (i = 0; i < ports_c; i++) { +- DEBUGP("unregistering port %d\n", +- ports[i]); +- ip_conntrack_helper_unregister(&irc_helpers[i]); +- } ++ KSYMMODUNRESOLVE(ip_conntrack_irc); ++ KSYMUNRESOLVE(init_iptable_irc); ++ KSYMUNRESOLVE(fini_iptable_irc); ++ fini_iptable_irc(); + kfree(irc_buffer); + } + +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_netlink.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_netlink.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_netlink.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_netlink.c 2006-01-27 14:48:08.000000000 +0300 +@@ -29,6 +29,7 @@ + #include <linux/spinlock.h> + #include <linux/interrupt.h> + #include <linux/notifier.h> ++#include <net/sock.h> + + #include <linux/netfilter.h> + #include <linux/netfilter_ipv4/ip_conntrack.h> +@@ -39,6 +40,8 @@ + + #include <linux/netfilter/nfnetlink.h> + #include <linux/netfilter/nfnetlink_conntrack.h> ++#include <ub/beancounter.h> ++#include <ub/ub_sk.h> + + MODULE_LICENSE("GPL"); + +@@ -409,7 +412,7 @@ ctnetlink_dump_table(struct sk_buff *skb + + read_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { +- list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { ++ list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; +@@ -446,7 +449,7 @@ ctnetlink_dump_table_w(struct sk_buff *s + + write_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { +- list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { ++ list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; +@@ -1009,14 +1012,15 @@ ctnetlink_change_conntrack(struct ip_con + static int + ctnetlink_create_conntrack(struct nfattr *cda[], + struct ip_conntrack_tuple *otuple, +- struct ip_conntrack_tuple *rtuple) ++ struct ip_conntrack_tuple *rtuple, ++ struct user_beancounter *ub) + { + struct ip_conntrack *ct; + int err = -EINVAL; + + DEBUGP("entered %s\n", __FUNCTION__); + +- ct = 
ip_conntrack_alloc(otuple, rtuple); ++ ct = ip_conntrack_alloc(otuple, rtuple, ub); + if (ct == NULL || IS_ERR(ct)) + return -ENOMEM; + +@@ -1093,8 +1097,16 @@ ctnetlink_new_conntrack(struct sock *ctn + write_unlock_bh(&ip_conntrack_lock); + DEBUGP("no such conntrack, create new\n"); + err = -ENOENT; +- if (nlh->nlmsg_flags & NLM_F_CREATE) +- err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); ++ if (nlh->nlmsg_flags & NLM_F_CREATE) { ++#ifdef CONFIG_USER_RESOURCE ++ if (skb->sk) ++ err = ctnetlink_create_conntrack(cda, &otuple, ++ &rtuple, sock_bc(skb->sk)->ub); ++ else ++#endif ++ err = ctnetlink_create_conntrack(cda, ++ &otuple, &rtuple, NULL); ++ } + return err; + } + /* implicit 'else' */ +@@ -1257,7 +1269,7 @@ ctnetlink_exp_dump_table(struct sk_buff + DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); + + read_lock_bh(&ip_conntrack_lock); +- list_for_each_prev(i, &ip_conntrack_expect_list) { ++ list_for_each_prev(i, &ve_ip_conntrack_expect_list) { + exp = (struct ip_conntrack_expect *) i; + if (exp->id <= *id) + continue; +@@ -1403,7 +1415,7 @@ ctnetlink_del_expect(struct sock *ctnl, + write_unlock_bh(&ip_conntrack_lock); + return -EINVAL; + } +- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, ++ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, + list) { + if (exp->master->helper == h + && del_timer(&exp->timeout)) { +@@ -1415,7 +1427,7 @@ ctnetlink_del_expect(struct sock *ctnl, + } else { + /* This basically means we have to flush everything*/ + write_lock_bh(&ip_conntrack_lock); +- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, ++ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, + list) { + if (del_timer(&exp->timeout)) { + ip_ct_unlink_expect(exp); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_proto_generic.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2006-01-27 14:48:08.000000000 +0300 +@@ -52,7 +52,7 @@ static int packet(struct ip_conntrack *c + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) + { +- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); ++ ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_generic_timeout); + return NF_ACCEPT; + } + +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -104,7 +104,7 @@ static int icmp_packet(struct ip_conntra + } else { + atomic_inc(&ct->proto.icmp.count); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); +- ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); ++ ip_ct_refresh_acct(ct, ctinfo, skb, ve_ip_ct_icmp_timeout); + } + + return NF_ACCEPT; +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -99,7 +99,7 @@ unsigned long ip_ct_tcp_timeout_close = + to ~13-30min depending on 
RTO. */ + unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; + +-static const unsigned long * tcp_timeouts[] ++const unsigned long * tcp_timeouts[] + = { NULL, /* TCP_CONNTRACK_NONE */ + &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ + &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ +@@ -763,7 +763,7 @@ static int tcp_in_window(struct ip_ct_tc + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + +- res = ip_ct_tcp_be_liberal; ++ res = ve_ip_ct_tcp_be_liberal; + } + + DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " +@@ -1034,9 +1034,11 @@ static int tcp_packet(struct ip_conntrac + && (new_state == TCP_CONNTRACK_FIN_WAIT + || new_state == TCP_CONNTRACK_CLOSE)) + conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; +- timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans +- && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans +- ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; ++ timeout = conntrack->proto.tcp.retrans >= ve_ip_ct_tcp_max_retrans && ++ ve_ip_ct_tcp_timeouts[new_state] > ++ ve_ip_ct_tcp_timeout_max_retrans ++ ? ve_ip_ct_tcp_timeout_max_retrans : ++ ve_ip_ct_tcp_timeouts[new_state]; + write_unlock_bh(&tcp_lock); + + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); +@@ -1111,7 +1113,7 @@ static int tcp_new(struct ip_conntrack * + conntrack->proto.tcp.seen[1].flags = 0; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = 0; +- } else if (ip_ct_tcp_loose == 0) { ++ } else if (ve_ip_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return 0; + } else { +@@ -1135,7 +1137,7 @@ static int tcp_new(struct ip_conntrack * + conntrack->proto.tcp.seen[0].flags = + conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; + conntrack->proto.tcp.seen[0].loose = +- conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose; ++ conntrack->proto.tcp.seen[1].loose = ve_ip_ct_tcp_loose; + } + + conntrack->proto.tcp.seen[1].td_end = 0; +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_proto_udp.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -71,12 +71,12 @@ static int udp_packet(struct ip_conntrac + stream. Extend timeout. 
*/ + if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + ip_ct_refresh_acct(conntrack, ctinfo, skb, +- ip_ct_udp_timeout_stream); ++ ve_ip_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) + ip_conntrack_event_cache(IPCT_STATUS, skb); + } else +- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); ++ ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_udp_timeout); + + return NF_ACCEPT; + } +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_standalone.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_standalone.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_conntrack_standalone.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_conntrack_standalone.c 2006-01-27 14:48:08.000000000 +0300 +@@ -27,6 +27,7 @@ + #endif + #include <net/checksum.h> + #include <net/ip.h> ++#include <linux/nfcalls.h> + + #define ASSERT_READ_LOCK(x) + #define ASSERT_WRITE_LOCK(x) +@@ -45,7 +46,17 @@ + + MODULE_LICENSE("GPL"); + ++int ip_conntrack_disable_ve0 = 0; ++module_param(ip_conntrack_disable_ve0, int, 0440); ++ + extern atomic_t ip_conntrack_count; ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_conntrack_count \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_count) ++#else ++#define ve_ip_conntrack_count ip_conntrack_count ++#endif + DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + + static int kill_proto(struct ip_conntrack *i, void *data) +@@ -88,8 +99,8 @@ static struct list_head *ct_get_first(st + for (st->bucket = 0; + st->bucket < ip_conntrack_htable_size; + st->bucket++) { +- if (!list_empty(&ip_conntrack_hash[st->bucket])) +- return ip_conntrack_hash[st->bucket].next; ++ if (!list_empty(&ve_ip_conntrack_hash[st->bucket])) ++ return ve_ip_conntrack_hash[st->bucket].next; + } + return NULL; + } +@@ -99,10 +110,10 @@ static struct list_head *ct_get_next(str + struct ct_iter_state *st = seq->private; + + head = head->next; +- while (head == &ip_conntrack_hash[st->bucket]) { ++ while (head == &ve_ip_conntrack_hash[st->bucket]) { + if (++st->bucket >= ip_conntrack_htable_size) + return NULL; +- head = ip_conntrack_hash[st->bucket].next; ++ head = ve_ip_conntrack_hash[st->bucket].next; + } + return head; + } +@@ -233,7 +244,7 @@ static struct file_operations ct_file_op + /* expects */ + static void *exp_seq_start(struct seq_file *s, loff_t *pos) + { +- struct list_head *e = &ip_conntrack_expect_list; ++ struct list_head *e = &ve_ip_conntrack_expect_list; + loff_t i; + + /* strange seq_file api calls stop even if we fail, +@@ -245,7 +256,7 @@ static void *exp_seq_start(struct seq_fi + + for (i = 0; i <= *pos; i++) { + e = e->next; +- if (e == &ip_conntrack_expect_list) ++ if (e == &ve_ip_conntrack_expect_list) + return NULL; + } + return e; +@@ -258,7 +269,7 @@ static void *exp_seq_next(struct seq_fil + ++*pos; + e = e->next; + +- if (e == &ip_conntrack_expect_list) ++ if (e == &ve_ip_conntrack_expect_list) + return NULL; + + return e; +@@ -343,7 +354,7 @@ static void ct_cpu_seq_stop(struct seq_f + + static int ct_cpu_seq_show(struct seq_file *seq, void *v) + { +- unsigned int nr_conntracks = atomic_read(&ip_conntrack_count); ++ unsigned int nr_conntracks = atomic_read(&ve_ip_conntrack_count); + struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { +@@ -564,6 +575,28 @@ static struct nf_hook_ops ip_conntrack_l + + /* From ip_conntrack_core.c */ + extern int ip_conntrack_max; ++#ifdef 
CONFIG_VE_IPTABLES ++#define ve_ip_conntrack_max \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_max) ++#define ve_ip_ct_sysctl_header \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_header) ++#define ve_ip_ct_net_table \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_net_table) ++#define ve_ip_ct_ipv4_table \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_ipv4_table) ++#define ve_ip_ct_netfilter_table \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_netfilter_table) ++#define ve_ip_ct_sysctl_table \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_table) ++#else ++#define ve_ip_conntrack_max ip_conntrack_max ++static struct ctl_table_header *ip_ct_sysctl_header; ++#define ve_ip_ct_sysctl_header ip_ct_sysctl_header ++#define ve_ip_ct_net_table ip_ct_net_table ++#define ve_ip_ct_ipv4_table ip_ct_ipv4_table ++#define ve_ip_ct_netfilter_table ip_ct_netfilter_table ++#define ve_ip_ct_sysctl_table ip_ct_sysctl_table ++#endif + extern unsigned int ip_conntrack_htable_size; + + /* From ip_conntrack_proto_tcp.c */ +@@ -594,8 +627,6 @@ extern unsigned long ip_ct_generic_timeo + static int log_invalid_proto_min = 0; + static int log_invalid_proto_max = 255; + +-static struct ctl_table_header *ip_ct_sysctl_header; +- + static ctl_table ip_ct_sysctl_table[] = { + { + .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, +@@ -804,6 +835,112 @@ static ctl_table ip_ct_net_table[] = { + }; + + EXPORT_SYMBOL(ip_ct_log_invalid); ++ ++#ifdef CONFIG_VE ++static void ip_conntrack_sysctl_cleanup(void) ++{ ++ if (!ve_is_super(get_exec_env())) { ++ kfree(ve_ip_ct_net_table); ++ kfree(ve_ip_ct_ipv4_table); ++ kfree(ve_ip_ct_netfilter_table); ++ kfree(ve_ip_ct_sysctl_table); ++ } ++ ve_ip_ct_net_table = NULL; ++ ve_ip_ct_ipv4_table = NULL; ++ ve_ip_ct_netfilter_table = NULL; ++ ve_ip_ct_sysctl_table = NULL; ++} ++ ++#define ALLOC_ENVCTL(field,k,label) \ ++ if ( !(field = kmalloc(k*sizeof(ctl_table), GFP_KERNEL)) ) \ ++ goto label; ++static int ip_conntrack_sysctl_init(void) ++{ ++ int i, ret = 0; ++ ++ ret = -ENOMEM; ++ if (ve_is_super(get_exec_env())) { ++ ve_ip_ct_net_table = ip_ct_net_table; ++ ve_ip_ct_ipv4_table = ip_ct_ipv4_table; ++ ve_ip_ct_netfilter_table = ip_ct_netfilter_table; ++ ve_ip_ct_sysctl_table = ip_ct_sysctl_table; ++ } else { ++ /* allocate structures in ve_struct */ ++ ALLOC_ENVCTL(ve_ip_ct_net_table, 2, out); ++ ALLOC_ENVCTL(ve_ip_ct_ipv4_table, 2, nomem_1); ++ ALLOC_ENVCTL(ve_ip_ct_netfilter_table, 3, nomem_2); ++ ALLOC_ENVCTL(ve_ip_ct_sysctl_table, 15, nomem_3); ++ ++ memcpy(ve_ip_ct_net_table, ip_ct_net_table, ++ 2*sizeof(ctl_table)); ++ memcpy(ve_ip_ct_ipv4_table, ip_ct_ipv4_table, ++ 2*sizeof(ctl_table)); ++ memcpy(ve_ip_ct_netfilter_table, ip_ct_netfilter_table, ++ 3*sizeof(ctl_table)); ++ memcpy(ve_ip_ct_sysctl_table, ip_ct_sysctl_table, ++ 21*sizeof(ctl_table)); ++ ++ ve_ip_ct_net_table[0].child = ve_ip_ct_ipv4_table; ++ ve_ip_ct_ipv4_table[0].child = ve_ip_ct_netfilter_table; ++ ve_ip_ct_netfilter_table[0].child = ve_ip_ct_sysctl_table; ++ } ++ ve_ip_ct_sysctl_table[0].data = &ve_ip_conntrack_max; ++ ve_ip_ct_netfilter_table[1].data = &ve_ip_conntrack_max; ++ ve_ip_ct_sysctl_table[1].data = &ve_ip_conntrack_count; ++ /* skip ve_ip_ct_sysctl_table[2].data as it is read-only and common ++ * for all environments */ ++ ve_ip_ct_tcp_timeouts[1] = ip_ct_tcp_timeout_syn_sent; ++ ve_ip_ct_sysctl_table[3].data = &ve_ip_ct_tcp_timeouts[1]; ++ ve_ip_ct_tcp_timeouts[2] = ip_ct_tcp_timeout_syn_recv; ++ ve_ip_ct_sysctl_table[4].data = &ve_ip_ct_tcp_timeouts[2]; ++ ve_ip_ct_tcp_timeouts[3] = ip_ct_tcp_timeout_established; 
++ ve_ip_ct_sysctl_table[5].data = &ve_ip_ct_tcp_timeouts[3]; ++ ve_ip_ct_tcp_timeouts[4] = ip_ct_tcp_timeout_fin_wait; ++ ve_ip_ct_sysctl_table[6].data = &ve_ip_ct_tcp_timeouts[4]; ++ ve_ip_ct_tcp_timeouts[5] = ip_ct_tcp_timeout_close_wait; ++ ve_ip_ct_sysctl_table[7].data = &ve_ip_ct_tcp_timeouts[5]; ++ ve_ip_ct_tcp_timeouts[6] = ip_ct_tcp_timeout_last_ack; ++ ve_ip_ct_sysctl_table[8].data = &ve_ip_ct_tcp_timeouts[6]; ++ ve_ip_ct_tcp_timeouts[7] = ip_ct_tcp_timeout_time_wait; ++ ve_ip_ct_sysctl_table[9].data = &ve_ip_ct_tcp_timeouts[7]; ++ ve_ip_ct_tcp_timeouts[8] = ip_ct_tcp_timeout_close; ++ ve_ip_ct_sysctl_table[10].data = &ve_ip_ct_tcp_timeouts[8]; ++ ve_ip_ct_udp_timeout = ip_ct_udp_timeout; ++ ve_ip_ct_sysctl_table[11].data = &ve_ip_ct_udp_timeout; ++ ve_ip_ct_udp_timeout_stream = ip_ct_udp_timeout_stream; ++ ve_ip_ct_sysctl_table[12].data = &ve_ip_ct_udp_timeout_stream; ++ ve_ip_ct_icmp_timeout = ip_ct_icmp_timeout; ++ ve_ip_ct_sysctl_table[13].data = &ve_ip_ct_icmp_timeout; ++ ve_ip_ct_generic_timeout = ip_ct_generic_timeout; ++ ve_ip_ct_sysctl_table[14].data = &ve_ip_ct_generic_timeout; ++ ve_ip_ct_log_invalid = ip_ct_log_invalid; ++ ve_ip_ct_sysctl_table[15].data = &ve_ip_ct_log_invalid; ++ ve_ip_ct_tcp_timeout_max_retrans = ip_ct_tcp_timeout_max_retrans; ++ ve_ip_ct_sysctl_table[16].data = &ve_ip_ct_tcp_timeout_max_retrans; ++ ve_ip_ct_tcp_loose = ip_ct_tcp_loose; ++ ve_ip_ct_sysctl_table[17].data = &ve_ip_ct_tcp_loose; ++ ve_ip_ct_tcp_be_liberal = ip_ct_tcp_be_liberal; ++ ve_ip_ct_sysctl_table[18].data = &ve_ip_ct_tcp_be_liberal; ++ ve_ip_ct_tcp_max_retrans = ip_ct_tcp_max_retrans; ++ ve_ip_ct_sysctl_table[19].data = &ve_ip_ct_tcp_max_retrans; ++ for (i = 0; i < 20; i++) ++ ve_ip_ct_sysctl_table[i].owner_env = get_exec_env(); ++ ve_ip_ct_netfilter_table[1].owner_env = get_exec_env(); ++ return 0; ++ ++nomem_3: ++ kfree(ve_ip_ct_netfilter_table); ++ ve_ip_ct_netfilter_table = NULL; ++nomem_2: ++ kfree(ve_ip_ct_ipv4_table); ++ ve_ip_ct_ipv4_table = NULL; ++nomem_1: ++ kfree(ve_ip_ct_net_table); ++ ve_ip_ct_net_table = NULL; ++out: ++ return ret; ++} ++#endif /*CONFIG_VE*/ + #endif /* CONFIG_SYSCTL */ + + static int init_or_cleanup(int init) +@@ -815,9 +952,16 @@ static int init_or_cleanup(int init) + + if (!init) goto cleanup; + ++ ret = -ENOENT; ++ if (!ve_is_super(get_exec_env())) ++ __module_get(THIS_MODULE); ++ + ret = ip_conntrack_init(); + if (ret < 0) +- goto cleanup_nothing; ++ goto cleanup_unget; ++ ++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) ++ return 0; + + #ifdef CONFIG_PROC_FS + ret = -ENOMEM; +@@ -827,98 +971,115 @@ static int init_or_cleanup(int init) + proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, + &exp_file_ops); + if (!proc_exp) goto cleanup_proc; ++ proc_exp->proc_fops = &exp_file_ops; + +- proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); +- if (!proc_stat) +- goto cleanup_proc_exp; ++ if (ve_is_super(get_exec_env())) { ++ proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); ++ if (!proc_stat) ++ goto cleanup_proc_exp; + +- proc_stat->proc_fops = &ct_cpu_seq_fops; +- proc_stat->owner = THIS_MODULE; ++ proc_stat->proc_fops = &ct_cpu_seq_fops; ++ proc_stat->owner = THIS_MODULE; ++ } + #endif + +- ret = nf_register_hook(&ip_conntrack_defrag_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_defrag_ops); + if (ret < 0) { + printk("ip_conntrack: can't register pre-routing defrag hook.\n"); + goto cleanup_proc_stat; + } +- ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops); ++ ret = 
virt_nf_register_hook(&ip_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local_out defrag hook.\n"); + goto cleanup_defragops; + } +- ret = nf_register_hook(&ip_conntrack_in_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } +- ret = nf_register_hook(&ip_conntrack_local_out_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local out hook.\n"); + goto cleanup_inops; + } +- ret = nf_register_hook(&ip_conntrack_helper_in_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_helper_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local in helper hook.\n"); + goto cleanup_inandlocalops; + } +- ret = nf_register_hook(&ip_conntrack_helper_out_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_helper_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register postrouting helper hook.\n"); + goto cleanup_helperinops; + } +- ret = nf_register_hook(&ip_conntrack_out_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register post-routing hook.\n"); + goto cleanup_helperoutops; + } +- ret = nf_register_hook(&ip_conntrack_local_in_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_local_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } + #ifdef CONFIG_SYSCTL +- ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); +- if (ip_ct_sysctl_header == NULL) { ++#ifdef CONFIG_VE ++ ret = ip_conntrack_sysctl_init(); ++ if (ret < 0) ++ goto cleanup_sysctl; ++#endif ++ ret = -ENOMEM; ++ ve_ip_ct_sysctl_header = register_sysctl_table(ve_ip_ct_net_table, 0); ++ if (ve_ip_ct_sysctl_header == NULL) { + printk("ip_conntrack: can't register to sysctl.\n"); +- ret = -ENOMEM; +- goto cleanup_localinops; ++ goto cleanup_sysctl2; + } + #endif + +- return ret; ++ return 0; + + cleanup: ++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) ++ goto cleanup_init; + synchronize_net(); + #ifdef CONFIG_SYSCTL +- unregister_sysctl_table(ip_ct_sysctl_header); +- cleanup_localinops: ++ unregister_sysctl_table(ve_ip_ct_sysctl_header); ++ cleanup_sysctl2: ++#ifdef CONFIG_VE ++ ip_conntrack_sysctl_cleanup(); ++ cleanup_sysctl: + #endif +- nf_unregister_hook(&ip_conntrack_local_in_ops); ++#endif ++ virt_nf_unregister_hook(&ip_conntrack_local_in_ops); + cleanup_inoutandlocalops: +- nf_unregister_hook(&ip_conntrack_out_ops); ++ virt_nf_unregister_hook(&ip_conntrack_out_ops); + cleanup_helperoutops: +- nf_unregister_hook(&ip_conntrack_helper_out_ops); ++ virt_nf_unregister_hook(&ip_conntrack_helper_out_ops); + cleanup_helperinops: +- nf_unregister_hook(&ip_conntrack_helper_in_ops); ++ virt_nf_unregister_hook(&ip_conntrack_helper_in_ops); + cleanup_inandlocalops: +- nf_unregister_hook(&ip_conntrack_local_out_ops); ++ virt_nf_unregister_hook(&ip_conntrack_local_out_ops); + cleanup_inops: +- nf_unregister_hook(&ip_conntrack_in_ops); ++ virt_nf_unregister_hook(&ip_conntrack_in_ops); + cleanup_defraglocalops: +- nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); ++ virt_nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); + cleanup_defragops: +- nf_unregister_hook(&ip_conntrack_defrag_ops); ++ virt_nf_unregister_hook(&ip_conntrack_defrag_ops); + cleanup_proc_stat: + #ifdef CONFIG_PROC_FS +- remove_proc_entry("ip_conntrack", 
proc_net_stat); ++ if (ve_is_super(get_exec_env())) ++ remove_proc_entry("ip_conntrack", proc_net_stat); + cleanup_proc_exp: + proc_net_remove("ip_conntrack_expect"); + cleanup_proc: + proc_net_remove("ip_conntrack"); +- cleanup_init: + #endif /* CONFIG_PROC_FS */ ++ cleanup_init: + ip_conntrack_cleanup(); +- cleanup_nothing: ++ cleanup_unget: ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); + return ret; + } + +@@ -929,11 +1090,11 @@ int ip_conntrack_protocol_register(struc + int ret = 0; + + write_lock_bh(&ip_conntrack_lock); +- if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { ++ if (ve_ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { + ret = -EBUSY; + goto out; + } +- ip_ct_protos[proto->proto] = proto; ++ ve_ip_ct_protos[proto->proto] = proto; + out: + write_unlock_bh(&ip_conntrack_lock); + return ret; +@@ -942,7 +1103,7 @@ int ip_conntrack_protocol_register(struc + void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) + { + write_lock_bh(&ip_conntrack_lock); +- ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; ++ ve_ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; + write_unlock_bh(&ip_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ +@@ -952,17 +1113,39 @@ void ip_conntrack_protocol_unregister(st + ip_ct_iterate_cleanup(kill_proto, &proto->proto); + } + +-static int __init init(void) ++int init_iptable_conntrack(void) + { + return init_or_cleanup(1); + } + +-static void __exit fini(void) ++void fini_iptable_conntrack(void) + { + init_or_cleanup(0); + } + +-module_init(init); ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_conntrack(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_conntrack); ++ KSYMRESOLVE(fini_iptable_conntrack); ++ KSYMMODRESOLVE(ip_conntrack); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_conntrack); ++ KSYMUNRESOLVE(init_iptable_conntrack); ++ KSYMUNRESOLVE(fini_iptable_conntrack); ++ fini_iptable_conntrack(); ++} ++ ++subsys_initcall(init); + module_exit(fini); + + /* Some modules need us, but don't depend directly on any symbol. 
+@@ -979,15 +1162,20 @@ EXPORT_SYMBOL_GPL(ip_conntrack_unregiste + EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); + EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); + #endif ++EXPORT_SYMBOL(ip_conntrack_disable_ve0); + EXPORT_SYMBOL(ip_conntrack_protocol_register); + EXPORT_SYMBOL(ip_conntrack_protocol_unregister); + EXPORT_SYMBOL(ip_ct_get_tuple); + EXPORT_SYMBOL(invert_tuplepr); + EXPORT_SYMBOL(ip_conntrack_alter_reply); ++#ifndef CONFIG_VE_IPTABLES + EXPORT_SYMBOL(ip_conntrack_destroyed); ++#endif + EXPORT_SYMBOL(need_ip_conntrack); + EXPORT_SYMBOL(ip_conntrack_helper_register); + EXPORT_SYMBOL(ip_conntrack_helper_unregister); ++EXPORT_SYMBOL(virt_ip_conntrack_helper_register); ++EXPORT_SYMBOL(virt_ip_conntrack_helper_unregister); + EXPORT_SYMBOL(ip_ct_iterate_cleanup); + EXPORT_SYMBOL(__ip_ct_refresh_acct); + +@@ -997,14 +1185,18 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_expect_ + EXPORT_SYMBOL_GPL(ip_conntrack_expect_find); + EXPORT_SYMBOL(ip_conntrack_expect_related); + EXPORT_SYMBOL(ip_conntrack_unexpect_related); ++#ifndef CONFIG_VE_IPTABLES + EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); ++#endif + EXPORT_SYMBOL_GPL(ip_ct_unlink_expect); + + EXPORT_SYMBOL(ip_conntrack_tuple_taken); + EXPORT_SYMBOL(ip_ct_gather_frags); + EXPORT_SYMBOL(ip_conntrack_htable_size); + EXPORT_SYMBOL(ip_conntrack_lock); ++#ifndef CONFIG_VE_IPTABLES + EXPORT_SYMBOL(ip_conntrack_hash); ++#endif + EXPORT_SYMBOL(ip_conntrack_untracked); + EXPORT_SYMBOL_GPL(ip_conntrack_find_get); + #ifdef CONFIG_IP_NF_NAT_NEEDED +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_core.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_core.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_core.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_core.c 2006-01-27 14:48:08.000000000 +0300 +@@ -21,6 +21,8 @@ + #include <linux/icmp.h> + #include <linux/udp.h> + #include <linux/jhash.h> ++#include <linux/nfcalls.h> ++#include <ub/ub_mem.h> + + #define ASSERT_READ_LOCK(x) + #define ASSERT_WRITE_LOCK(x) +@@ -46,15 +48,24 @@ DEFINE_RWLOCK(ip_nat_lock); + /* Calculated at init based on memory size */ + static unsigned int ip_nat_htable_size; + +-static struct list_head *bysource; +- + #define MAX_IP_NAT_PROTO 256 ++ ++#ifdef CONFIG_VE_IPTABLES ++#define ve_ip_nat_bysource \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_bysource) ++#define ve_ip_nat_protos \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_protos) ++#else ++static struct list_head *bysource; ++#define ve_ip_nat_bysource bysource + static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; ++#define ve_ip_nat_protos ip_nat_protos ++#endif + + static inline struct ip_nat_protocol * + __ip_nat_proto_find(u_int8_t protonum) + { +- return ip_nat_protos[protonum]; ++ return ve_ip_nat_protos[protonum]; + } + + struct ip_nat_protocol * +@@ -177,7 +188,7 @@ find_appropriate_src(const struct ip_con + struct ip_conntrack *ct; + + read_lock_bh(&ip_nat_lock); +- list_for_each_entry(ct, &bysource[h], nat.info.bysource) { ++ list_for_each_entry(ct, &ve_ip_nat_bysource[h], nat.info.bysource) { + if (same_src(ct, tuple)) { + /* Copy source part from reply tuple. 
*/ + invert_tuplepr(result, +@@ -337,7 +348,7 @@ ip_nat_setup_info(struct ip_conntrack *c + = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple); + write_lock_bh(&ip_nat_lock); +- list_add(&info->bysource, &bysource[srchash]); ++ list_add(&info->bysource, &ve_ip_nat_bysource[srchash]); + write_unlock_bh(&ip_nat_lock); + } + +@@ -521,11 +532,11 @@ int ip_nat_protocol_register(struct ip_n + int ret = 0; + + write_lock_bh(&ip_nat_lock); +- if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { ++ if (ve_ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { + ret = -EBUSY; + goto out; + } +- ip_nat_protos[proto->protonum] = proto; ++ ve_ip_nat_protos[proto->protonum] = proto; + out: + write_unlock_bh(&ip_nat_lock); + return ret; +@@ -536,7 +547,7 @@ EXPORT_SYMBOL(ip_nat_protocol_register); + void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) + { + write_lock_bh(&ip_nat_lock); +- ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; ++ ve_ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; + write_unlock_bh(&ip_nat_lock); + + /* Someone could be still looking at the proto in a bh. */ +@@ -589,38 +600,55 @@ EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_ + EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr); + #endif + +-static int __init ip_nat_init(void) ++static int ip_nat_init(void) + { + size_t i; ++ int ret; + +- /* Leave them the same for the moment. */ +- ip_nat_htable_size = ip_conntrack_htable_size; ++ if (ve_is_super(get_exec_env())) ++ ip_nat_htable_size = ip_conntrack_htable_size; + + /* One vmalloc for both hash tables */ +- bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size); +- if (!bysource) +- return -ENOMEM; ++ ret = -ENOMEM; ++ ve_ip_nat_bysource = ++ ub_vmalloc(sizeof(struct list_head)*ip_nat_htable_size*2); ++ if (!ve_ip_nat_bysource) ++ goto nomem; ++ ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_protos = ++ ub_kmalloc(sizeof(void *)*MAX_IP_NAT_PROTO, GFP_KERNEL); ++ if (!ve_ip_nat_protos) ++ goto nomem2; ++#endif + + /* Sew in builtin protocols. */ + write_lock_bh(&ip_nat_lock); + for (i = 0; i < MAX_IP_NAT_PROTO; i++) +- ip_nat_protos[i] = &ip_nat_unknown_protocol; +- ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; +- ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; +- ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; ++ ve_ip_nat_protos[i] = &ip_nat_unknown_protocol; ++ ve_ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; ++ ve_ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; ++ ve_ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; + write_unlock_bh(&ip_nat_lock); + + for (i = 0; i < ip_nat_htable_size; i++) { +- INIT_LIST_HEAD(&bysource[i]); ++ INIT_LIST_HEAD(&ve_ip_nat_bysource[i]); + } + + /* FIXME: Man, this is a hack. <SIGH> */ + IP_NF_ASSERT(ip_conntrack_destroyed == NULL); +- ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; ++ ve_ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; + +- /* Initialize fake conntrack so that NAT will skip it */ +- ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; ++ if (ve_is_super(get_exec_env())) ++ /* Initialize fake conntrack so that NAT will skip it */ ++ ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + return 0; ++#ifdef CONFIG_VE_IPTABLES ++nomem2: ++#endif ++ vfree(ve_ip_nat_bysource); ++nomem: ++ return ret; + } + + /* Clear NAT section of all conntracks, in case we're loaded again. 
*/ +@@ -631,14 +659,41 @@ static int clean_nat(struct ip_conntrack + return 0; + } + +-static void __exit ip_nat_cleanup(void) ++static void ip_nat_cleanup(void) + { + ip_ct_iterate_cleanup(&clean_nat, NULL); +- ip_conntrack_destroyed = NULL; +- vfree(bysource); ++ ve_ip_conntrack_destroyed = NULL; ++ vfree(ve_ip_nat_bysource); ++ ve_ip_nat_bysource = NULL; ++#ifdef CONFIG_VE_IPTABLES ++ kfree(ve_ip_nat_protos); ++ ve_ip_nat_protos = NULL; ++#endif ++} ++ ++static int __init init(void) ++{ ++ int err; ++ ++ err = ip_nat_init(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(ip_nat_init); ++ KSYMRESOLVE(ip_nat_cleanup); ++ KSYMMODRESOLVE(ip_nat); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_nat); ++ KSYMUNRESOLVE(ip_nat_cleanup); ++ KSYMUNRESOLVE(ip_nat_init); ++ ip_nat_cleanup(); + } + + MODULE_LICENSE("GPL"); + +-module_init(ip_nat_init); +-module_exit(ip_nat_cleanup); ++fs_initcall(init); ++module_exit(fini); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_ftp.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_ftp.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_ftp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_ftp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -19,6 +19,7 @@ + #include <linux/netfilter_ipv4/ip_nat_rule.h> + #include <linux/netfilter_ipv4/ip_conntrack_ftp.h> + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +@@ -154,18 +155,43 @@ static unsigned int ip_nat_ftp(struct sk + return NF_ACCEPT; + } + +-static void __exit fini(void) ++#ifdef CONFIG_VE_IPTABLES ++#undef ve_ip_nat_ftp_hook ++#define ve_ip_nat_ftp_hook \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook) ++#endif ++int init_iptable_nat_ftp(void) + { +- ip_nat_ftp_hook = NULL; ++ BUG_ON(ve_ip_nat_ftp_hook); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_ftp_hook = (ip_nat_helper_func)ip_nat_ftp; ++#else ++ ve_ip_nat_ftp_hook = ip_nat_ftp; ++#endif ++ return 0; ++} ++ ++void fini_iptable_nat_ftp(void) ++{ ++ ve_ip_nat_ftp_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); + } + ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_nat_ftp); ++ KSYMUNRESOLVE(init_iptable_nat_ftp); ++ KSYMUNRESOLVE(fini_iptable_nat_ftp); ++ fini_iptable_nat_ftp(); ++} ++ + static int __init init(void) + { +- BUG_ON(ip_nat_ftp_hook); +- ip_nat_ftp_hook = ip_nat_ftp; +- return 0; ++ KSYMRESOLVE(init_iptable_nat_ftp); ++ KSYMRESOLVE(fini_iptable_nat_ftp); ++ KSYMMODRESOLVE(ip_nat_ftp); ++ return init_iptable_nat_ftp(); + } + + /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. 
*/ +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_irc.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_irc.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_irc.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_irc.c 2006-01-27 14:48:08.000000000 +0300 +@@ -23,6 +23,7 @@ + #include <linux/netfilter_ipv4/ip_conntrack_irc.h> + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> + #include <linux/moduleparam.h> ++#include <linux/nfcalls.h> + + #if 0 + #define DEBUGP printk +@@ -96,18 +97,44 @@ static unsigned int help(struct sk_buff + return ret; + } + +-static void __exit fini(void) ++#ifdef CONFIG_VE_IPTABLES ++#undef ve_ip_nat_irc_hook ++#define ve_ip_nat_irc_hook \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook) ++#endif ++ ++int init_iptable_nat_irc(void) ++{ ++ BUG_ON(ve_ip_nat_irc_hook); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_irc_hook = (ip_nat_helper_func)help; ++#else ++ ve_ip_nat_irc_hook = help; ++#endif ++ return 0; ++} ++ ++void fini_iptable_nat_irc(void) + { +- ip_nat_irc_hook = NULL; ++ ve_ip_nat_irc_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); + } + ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_nat_irc); ++ KSYMUNRESOLVE(init_iptable_nat_irc); ++ KSYMUNRESOLVE(fini_iptable_nat_irc); ++ fini_iptable_nat_irc(); ++} ++ + static int __init init(void) + { +- BUG_ON(ip_nat_irc_hook); +- ip_nat_irc_hook = help; +- return 0; ++ KSYMRESOLVE(init_iptable_nat_irc); ++ KSYMRESOLVE(fini_iptable_nat_irc); ++ KSYMMODRESOLVE(ip_nat_irc); ++ return init_iptable_nat_irc(); + } + + /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_rule.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_rule.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_rule.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_rule.c 2006-01-27 14:48:08.000000000 +0300 +@@ -34,6 +34,13 @@ + #define DEBUGP(format, args...) 
+ #endif + ++#ifdef CONFIG_VE_IPTABLES ++#define ve_ip_nat_table \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_table) ++#else ++#define ve_ip_nat_table &nat_table ++#endif ++ + #define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) + + static struct +@@ -41,7 +48,7 @@ static struct + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +-} nat_initial_table __initdata ++} nat_initial_table + = { { "nat", NAT_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] = 0, +@@ -285,7 +292,7 @@ int ip_nat_rule_find(struct sk_buff **ps + { + int ret; + +- ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL); ++ ret = ipt_do_table(pskb, hooknum, in, out, ve_ip_nat_table, NULL); + + if (ret == NF_ACCEPT) { + if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum))) +@@ -307,34 +314,46 @@ static struct ipt_target ipt_dnat_reg = + .checkentry = ipt_dnat_checkentry, + }; + +-int __init ip_nat_rule_init(void) ++int ip_nat_rule_init(void) + { + int ret; ++ struct ipt_table *tmp_table; + +- ret = ipt_register_table(&nat_table, &nat_initial_table.repl); +- if (ret != 0) +- return ret; +- ret = ipt_register_target(&ipt_snat_reg); ++ tmp_table = virt_ipt_register_table(&nat_table, ++ &nat_initial_table.repl); ++ if (IS_ERR(tmp_table)) ++ return PTR_ERR(tmp_table); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_table = tmp_table; ++#endif ++ ++ ret = virt_ipt_register_target(&ipt_snat_reg); + if (ret != 0) + goto unregister_table; + +- ret = ipt_register_target(&ipt_dnat_reg); ++ ret = virt_ipt_register_target(&ipt_dnat_reg); + if (ret != 0) + goto unregister_snat; + + return ret; + + unregister_snat: +- ipt_unregister_target(&ipt_snat_reg); ++ virt_ipt_unregister_target(&ipt_snat_reg); + unregister_table: +- ipt_unregister_table(&nat_table); ++ virt_ipt_unregister_table(ve_ip_nat_table); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_table = NULL; ++#endif + + return ret; + } + + void ip_nat_rule_cleanup(void) + { +- ipt_unregister_target(&ipt_dnat_reg); +- ipt_unregister_target(&ipt_snat_reg); +- ipt_unregister_table(&nat_table); ++ virt_ipt_unregister_target(&ipt_dnat_reg); ++ virt_ipt_unregister_target(&ipt_snat_reg); ++ virt_ipt_unregister_table(ve_ip_nat_table); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_table = NULL; ++#endif + } +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_standalone.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_standalone.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_nat_standalone.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_nat_standalone.c 2006-01-27 14:48:08.000000000 +0300 +@@ -30,6 +30,7 @@ + #include <net/ip.h> + #include <net/checksum.h> + #include <linux/spinlock.h> ++#include <linux/nfcalls.h> + + #define ASSERT_READ_LOCK(x) + #define ASSERT_WRITE_LOCK(x) +@@ -320,37 +321,43 @@ static int init_or_cleanup(int init) + + if (!init) goto cleanup; + ++ if (!ve_is_super(get_exec_env())) ++ __module_get(THIS_MODULE); ++ + ret = ip_nat_rule_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); +- goto cleanup_nothing; ++ goto cleanup_modput; + } +- ret = nf_register_hook(&ip_nat_in_ops); ++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) ++ return 0; ++ ++ ret = virt_nf_register_hook(&ip_nat_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register in hook.\n"); + goto cleanup_rule_init; + } +- ret = nf_register_hook(&ip_nat_out_ops); ++ ret = 
virt_nf_register_hook(&ip_nat_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register out hook.\n"); + goto cleanup_inops; + } +- ret = nf_register_hook(&ip_nat_adjust_in_ops); ++ ret = virt_nf_register_hook(&ip_nat_adjust_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register adjust in hook.\n"); + goto cleanup_outops; + } +- ret = nf_register_hook(&ip_nat_adjust_out_ops); ++ ret = virt_nf_register_hook(&ip_nat_adjust_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register adjust out hook.\n"); + goto cleanup_adjustin_ops; + } +- ret = nf_register_hook(&ip_nat_local_out_ops); ++ ret = virt_nf_register_hook(&ip_nat_local_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local out hook.\n"); + goto cleanup_adjustout_ops;; + } +- ret = nf_register_hook(&ip_nat_local_in_ops); ++ ret = virt_nf_register_hook(&ip_nat_local_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local in hook.\n"); + goto cleanup_localoutops; +@@ -358,34 +365,60 @@ static int init_or_cleanup(int init) + return ret; + + cleanup: +- nf_unregister_hook(&ip_nat_local_in_ops); ++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) ++ goto cleanup_rule_init; ++ virt_nf_unregister_hook(&ip_nat_local_in_ops); + cleanup_localoutops: +- nf_unregister_hook(&ip_nat_local_out_ops); ++ virt_nf_unregister_hook(&ip_nat_local_out_ops); + cleanup_adjustout_ops: +- nf_unregister_hook(&ip_nat_adjust_out_ops); ++ virt_nf_unregister_hook(&ip_nat_adjust_out_ops); + cleanup_adjustin_ops: +- nf_unregister_hook(&ip_nat_adjust_in_ops); ++ virt_nf_unregister_hook(&ip_nat_adjust_in_ops); + cleanup_outops: +- nf_unregister_hook(&ip_nat_out_ops); ++ virt_nf_unregister_hook(&ip_nat_out_ops); + cleanup_inops: +- nf_unregister_hook(&ip_nat_in_ops); ++ virt_nf_unregister_hook(&ip_nat_in_ops); + cleanup_rule_init: + ip_nat_rule_cleanup(); +- cleanup_nothing: ++ cleanup_modput: ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); + return ret; + } + +-static int __init init(void) ++int init_iptable_nat(void) + { + return init_or_cleanup(1); + } + +-static void __exit fini(void) ++void fini_iptable_nat(void) + { + init_or_cleanup(0); + } + +-module_init(init); ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_nat(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_nat); ++ KSYMRESOLVE(fini_iptable_nat); ++ KSYMMODRESOLVE(iptable_nat); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(iptable_nat); ++ KSYMUNRESOLVE(init_iptable_nat); ++ KSYMUNRESOLVE(fini_iptable_nat); ++ fini_iptable_nat(); ++} ++ ++fs_initcall(init); + module_exit(fini); + + MODULE_LICENSE("GPL"); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_queue.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_queue.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ip_queue.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_queue.c 2006-01-27 14:48:08.000000000 +0300 +@@ -542,8 +542,17 @@ ipq_rcv_sk(struct sock *sk, int len) + down(&ipqnl_sem); + + for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { ++#ifdef CONFIG_VE ++ struct ve_struct *env; ++#endif + skb = skb_dequeue(&sk->sk_receive_queue); ++#ifdef CONFIG_VE ++ env = set_exec_env(VE_OWNER_SKB(skb)); + ipq_rcv_skb(skb); ++ (void)set_exec_env(env); ++#else ++ ipq_rcv_skb(skb); ++#endif + kfree_skb(skb); + } + +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ip_tables.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_tables.c +--- 
linux-2.6.15.orig/net/ipv4/netfilter/ip_tables.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ip_tables.c 2006-01-27 14:48:08.000000000 +0300 +@@ -28,9 +28,13 @@ + #include <linux/proc_fs.h> + #include <linux/err.h> + #include <linux/cpumask.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ip_tables.h> + ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> ++ + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); + MODULE_DESCRIPTION("IPv4 packet filter"); +@@ -111,6 +115,52 @@ struct ipt_table_info + static LIST_HEAD(ipt_target); + static LIST_HEAD(ipt_match); + static LIST_HEAD(ipt_tables); ++ ++#ifdef CONFIG_VE_IPTABLES ++/* include ve.h and define get_exec_env */ ++#include <linux/sched.h> ++ ++int init_iptables(void); ++ ++#define ve_ipt_target (*(get_exec_env()->_ipt_target)) ++#define ve_ipt_match (*(get_exec_env()->_ipt_match)) ++#define ve_ipt_tables (*(get_exec_env()->_ipt_tables)) ++#define ve_ipt_standard_target (*(get_exec_env()->_ipt_standard_target)) ++#define ve_ipt_error_target (*(get_exec_env()->_ipt_error_target)) ++#define ve_tcp_matchstruct (*(get_exec_env()->_tcp_matchstruct)) ++#define ve_udp_matchstruct (*(get_exec_env()->_udp_matchstruct)) ++#define ve_icmp_matchstruct (*(get_exec_env()->_icmp_matchstruct)) ++ ++ ++#ifdef CONFIG_USER_RESOURCE ++#define UB_NUMIPTENT 23 ++static int charge_iptables(struct user_beancounter *ub, unsigned long size) ++{ ++ if (ub == NULL) ++ return 0; ++ return charge_beancounter(ub, UB_NUMIPTENT, size, 1); ++} ++static void uncharge_iptables(struct user_beancounter *ub, unsigned long size) ++{ ++ if (ub == NULL) ++ return; ++ uncharge_beancounter(ub, UB_NUMIPTENT, size); ++} ++#endif /* CONFIG_USER_RESOURCE */ ++ ++#else /* CONFIG_VE_IPTABLES */ ++ ++#define ve_ipt_target ipt_target ++#define ve_ipt_match ipt_match ++#define ve_ipt_tables ipt_tables ++#define ve_ipt_standard_target ipt_standard_target ++#define ve_ipt_error_target ipt_error_target ++#define ve_tcp_matchstruct tcp_matchstruct ++#define ve_udp_matchstruct udp_matchstruct ++#define ve_icmp_matchstruct icmp_matchstruct ++ ++#endif /* CONFIG_VE_IPTABLES */ ++ + #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + + #ifdef CONFIG_SMP +@@ -422,7 +472,7 @@ static inline struct ipt_table *find_tab + if (down_interruptible(&ipt_mutex) != 0) + return ERR_PTR(-EINTR); + +- list_for_each_entry(t, &ipt_tables, list) ++ list_for_each_entry(t, &ve_ipt_tables, list) + if (strcmp(t->name, name) == 0 && try_module_get(t->me)) + return t; + up(&ipt_mutex); +@@ -462,7 +512,7 @@ static inline struct ipt_target *find_ta + if (down_interruptible(&ipt_mutex) != 0) + return ERR_PTR(-EINTR); + +- list_for_each_entry(t, &ipt_target, list) { ++ list_for_each_entry(t, &ve_ipt_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision == revision) { + if (try_module_get(t->me)) { +@@ -493,7 +543,7 @@ static int match_revfn(const char *name, + struct ipt_match *m; + int have_rev = 0; + +- list_for_each_entry(m, &ipt_match, list) { ++ list_for_each_entry(m, &ve_ipt_match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision > *bestp) + *bestp = m->revision; +@@ -509,7 +559,7 @@ static int target_revfn(const char *name + struct ipt_target *t; + int have_rev = 0; + +- list_for_each_entry(t, &ipt_target, list) { ++ list_for_each_entry(t, &ve_ipt_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision > *bestp) + *bestp = t->revision; +@@ -585,7 +635,7 
@@ mark_source_chains(struct ipt_table_info + = (void *)ipt_get_target(e); + + if (e->comefrom & (1 << NF_IP_NUMHOOKS)) { +- printk("iptables: loop hook %u pos %u %08X.\n", ++ ve_printk(VE_LOG, "iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } +@@ -763,7 +813,7 @@ check_entry(struct ipt_entry *e, const c + } + t->u.kernel.target = target; + +- if (t->u.kernel.target == &ipt_standard_target) { ++ if (t->u.kernel.target == &ve_ipt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; +@@ -933,6 +983,69 @@ translate_table(const char *name, + return ret; + } + ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_USER_RESOURCE) ++static int charge_replace_table(struct ipt_table_info *oldinfo, ++ struct ipt_table_info *newinfo) ++{ ++ struct user_beancounter *old_ub, *new_ub; ++ int old_number, new_number; ++ ++ old_ub = vmalloc_ub(oldinfo); ++ new_ub = vmalloc_ub(newinfo); ++ old_number = oldinfo->number; ++ new_number = newinfo->number; ++ ++ /* XXX: I don't understand the code below and am not sure that it does ++ * something reasonable. 2002/04/26 SAW */ ++ if (old_ub == new_ub) { ++ int charge; ++ /* charge only differences in entries */ ++ charge = new_number - old_number; ++ if (charge > 0) { ++ if (charge_iptables(old_ub, charge)) ++ return -1; ++ } else ++ uncharge_iptables(old_ub, -charge); ++ } else { ++ /* different contexts; do charge current and uncharge old */ ++ if (charge_iptables(new_ub, new_number)) ++ return -1; ++ uncharge_iptables(old_ub, old_number); ++ } ++ return 0; ++} ++#endif ++ ++static int setup_table(struct ipt_table *table, struct ipt_table_info *info) ++{ ++#ifdef CONFIG_NETFILTER_DEBUG ++ { ++ struct ipt_entry *table_base; ++ unsigned int i; ++ ++ for (i = 0; i < NR_CPUS; i++) { ++ table_base = ++ (void *)info->entries ++ + TABLE_OFFSET(info, i); ++ ++ table_base->comefrom = 0xdead57ac; ++ } ++ } ++#endif ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_USER_RESOURCE) ++ { ++ struct user_beancounter *ub; ++ ++ ub = vmalloc_ub(info); ++ if (charge_iptables(ub, info->number)) ++ return -ENOMEM; ++ } ++#endif ++ table->private = info; ++ info->initial_entries = 0; ++ return 0; ++} ++ + static struct ipt_table_info * + replace_table(struct ipt_table *table, + unsigned int num_counters, +@@ -967,6 +1080,16 @@ replace_table(struct ipt_table *table, + return NULL; + } + oldinfo = table->private; ++ ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_USER_RESOURCE) ++ if (charge_replace_table(oldinfo, newinfo)) { ++ oldinfo = NULL; ++ write_unlock_bh(&table->lock); ++ *error = -ENOMEM; ++ return NULL; ++ } ++#endif ++ + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); +@@ -1017,7 +1140,7 @@ copy_entries_to_user(unsigned int total_ + (other than comefrom, which userspace doesn't care + about). 
*/ + countersize = sizeof(struct ipt_counters) * table->private->number; +- counters = vmalloc(countersize); ++ counters = ub_vmalloc_best(countersize); + + if (counters == NULL) + return -ENOMEM; +@@ -1130,7 +1253,7 @@ do_replace(void __user *user, unsigned i + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + return -ENOMEM; + +- newinfo = vmalloc(sizeof(struct ipt_table_info) ++ newinfo = ub_vmalloc_best(sizeof(struct ipt_table_info) + + SMP_ALIGN(tmp.size) * + (highest_possible_processor_id()+1)); + if (!newinfo) +@@ -1142,7 +1265,7 @@ do_replace(void __user *user, unsigned i + goto free_newinfo; + } + +- counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters)); ++ counters = ub_vmalloc_best(tmp.num_counters * sizeof(struct ipt_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; +@@ -1246,7 +1369,7 @@ do_add_counters(void __user *user, unsig + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) + return -EINVAL; + +- paddc = vmalloc(len); ++ paddc = ub_vmalloc_best(len); + if (!paddc) + return -ENOMEM; + +@@ -1288,7 +1411,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -1313,7 +1436,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -1418,19 +1541,69 @@ ipt_register_target(struct ipt_target *t + ret = down_interruptible(&ipt_mutex); + if (ret != 0) + return ret; +- list_add(&target->list, &ipt_target); ++ list_add(&target->list, &ve_ipt_target); + up(&ipt_mutex); + return ret; + } + ++int ++virt_ipt_register_target(struct ipt_target *target) ++{ ++ int ret; ++ struct module *mod = target->me; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct ipt_target *tmp; ++ __module_get(mod); ++ ret = -ENOMEM; ++ tmp = ub_kmalloc(sizeof(struct ipt_target), GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, target, sizeof(struct ipt_target)); ++ target = tmp; ++ } ++ ++ ret = ipt_register_target(target); ++ if (ret) ++ goto out; ++ ++ return 0; ++out: ++ if (!ve_is_super(get_exec_env())) { ++ kfree(target); ++nomem: ++ module_put(mod); ++ } ++ return ret; ++} ++ + void + ipt_unregister_target(struct ipt_target *target) + { + down(&ipt_mutex); +- LIST_DELETE(&ipt_target, target); ++ LIST_DELETE(&ve_ipt_target, target); + up(&ipt_mutex); + } + ++void ++virt_ipt_unregister_target(struct ipt_target *target) ++{ ++ if (!ve_is_super(get_exec_env())) { ++ down(&ipt_mutex); ++ target = list_named_find(&ve_ipt_target, target->name); ++ up(&ipt_mutex); ++ if (!target) ++ return; ++ } ++ ++ ipt_unregister_target(target); ++ ++ if (!ve_is_super(get_exec_env())) { ++ module_put(target->me); ++ kfree(target); ++ } ++} ++ + int + ipt_register_match(struct ipt_match *match) + { +@@ -1440,17 +1613,81 @@ ipt_register_match(struct ipt_match *mat + if (ret != 0) + return ret; + +- list_add(&match->list, &ipt_match); ++ list_add(&match->list, &ve_ipt_match); + up(&ipt_mutex); + + return ret; + } + ++int ++virt_ipt_register_match(struct ipt_match *match) ++{ ++ int ret; ++ struct module *mod = match->me; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct ipt_match *tmp; ++ __module_get(mod); ++ ret = -ENOMEM; ++ tmp = ub_kmalloc(sizeof(struct ipt_match), GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, match, sizeof(struct ipt_match)); ++ match = tmp; ++ } ++ ++ ret = ipt_register_match(match); ++ if (ret) ++ 
goto out; ++ ++ return 0; ++out: ++ if (!ve_is_super(get_exec_env())) { ++ kfree(match); ++nomem: ++ module_put(mod); ++ } ++ return ret; ++} ++ + void + ipt_unregister_match(struct ipt_match *match) + { + down(&ipt_mutex); +- LIST_DELETE(&ipt_match, match); ++ LIST_DELETE(&ve_ipt_match, match); ++ up(&ipt_mutex); ++} ++ ++void ++virt_ipt_unregister_match(struct ipt_match *match) ++{ ++ if (!ve_is_super(get_exec_env())) { ++ down(&ipt_mutex); ++ match = list_named_find(&ve_ipt_match, match->name); ++ up(&ipt_mutex); ++ if (!match) ++ return; ++ } ++ ++ ipt_unregister_match(match); ++ ++ if (!ve_is_super(get_exec_env())) { ++ module_put(match->me); ++ kfree(match); ++ } ++} ++ ++void ipt_flush_table(struct ipt_table *table) ++{ ++ if (table == NULL) ++ return; ++ ++ down(&ipt_mutex); ++ IPT_ENTRY_ITERATE(table->private->entries, table->private->size, ++ cleanup_entry, NULL); ++ if (table->private->number > table->private->initial_entries) ++ module_put(table->me); ++ table->private->size = 0; + up(&ipt_mutex); + } + +@@ -1458,14 +1695,13 @@ int ipt_register_table(struct ipt_table + { + int ret; + struct ipt_table_info *newinfo; +- static struct ipt_table_info bootstrap +- = { 0, 0, 0, { 0 }, { 0 }, { } }; + +- newinfo = vmalloc(sizeof(struct ipt_table_info) ++ ret = -ENOMEM; ++ newinfo = ub_vmalloc_best(sizeof(struct ipt_table_info) + + SMP_ALIGN(repl->size) * + (highest_possible_processor_id()+1)); + if (!newinfo) +- return -ENOMEM; ++ goto out; + + memcpy(newinfo->entries, repl->entries, repl->size); + +@@ -1474,27 +1710,21 @@ int ipt_register_table(struct ipt_table + repl->num_entries, + repl->hook_entry, + repl->underflow); +- if (ret != 0) { +- vfree(newinfo); +- return ret; +- } ++ if (ret != 0) ++ goto out_free; + + ret = down_interruptible(&ipt_mutex); +- if (ret != 0) { +- vfree(newinfo); +- return ret; +- } ++ if (ret != 0) ++ goto out_free; + + /* Don't autoload: we'd eat our tail... */ +- if (list_named_find(&ipt_tables, table->name)) { +- ret = -EEXIST; +- goto free_unlock; +- } +- +- /* Simplifies replace_table code. 
*/ +- table->private = &bootstrap; +- if (!replace_table(table, 0, newinfo, &ret)) +- goto free_unlock; ++ ret = -EEXIST; ++ if (list_named_find(&ve_ipt_tables, table->name)) ++ goto out_free_unlock; ++ ++ ret = setup_table(table, newinfo); ++ if (ret) ++ goto out_free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); +@@ -1503,29 +1733,77 @@ int ipt_register_table(struct ipt_table + table->private->initial_entries = table->private->number; + + rwlock_init(&table->lock); +- list_prepend(&ipt_tables, table); ++ list_prepend(&ve_ipt_tables, table); ++ up(&ipt_mutex); ++ return 0; + +- unlock: ++out_free_unlock: + up(&ipt_mutex); ++out_free: ++ vfree(newinfo); ++out: + return ret; ++} + +- free_unlock: +- vfree(newinfo); +- goto unlock; ++struct ipt_table * virt_ipt_register_table(struct ipt_table *table, ++ const struct ipt_replace *repl) ++{ ++ int ret; ++ struct module *mod = table->me; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct ipt_table *tmp; ++ __module_get(mod); ++ ret = -ENOMEM; ++ tmp = ub_kmalloc(sizeof(struct ipt_table), GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, table, sizeof(struct ipt_table)); ++ table = tmp; ++ } ++ ++ ret = ipt_register_table(table, repl); ++ if (ret) ++ goto out; ++ ++ return table; ++out: ++ if (!ve_is_super(get_exec_env())) { ++ kfree(table); ++nomem: ++ module_put(mod); ++ } ++ return ERR_PTR(ret); + } + + void ipt_unregister_table(struct ipt_table *table) + { + down(&ipt_mutex); +- LIST_DELETE(&ipt_tables, table); ++ LIST_DELETE(&ve_ipt_tables, table); + up(&ipt_mutex); + ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_USER_RESOURCE) ++ uncharge_iptables(vmalloc_ub(table->private), ++ table->private->number); ++#endif ++ + /* Decrease module usage counts and free resources */ + IPT_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); + } + ++void ++virt_ipt_unregister_table(struct ipt_table *table) ++{ ++ ipt_unregister_table(table); ++ ++ if (!ve_is_super(get_exec_env())) { ++ module_put(table->me); ++ kfree(table); ++ } ++} ++ + /* Returns 1 if the port is matched by the range, 0 otherwise */ + static inline int + port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +@@ -1835,7 +2113,7 @@ static inline int print_target(const str + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) + { +- if (t == &ipt_standard_target || t == &ipt_error_target) ++ if (t == &ve_ipt_standard_target || t == &ve_ipt_error_target) + return 0; + return print_name((char *)t, start_offset, buffer, length, pos, count); + } +@@ -1845,10 +2123,16 @@ static int ipt_get_tables(char *buffer, + off_t pos = 0; + unsigned int count = 0; + ++#ifdef CONFIG_VE_IPTABLES ++ /* if we don't initialized for current VE exiting */ ++ if (&ve_ipt_standard_target == NULL) ++ return 0; ++#endif ++ + if (down_interruptible(&ipt_mutex) != 0) + return 0; + +- LIST_FIND(&ipt_tables, print_name, void *, ++ LIST_FIND(&ve_ipt_tables, print_name, void *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); +@@ -1866,7 +2150,7 @@ static int ipt_get_targets(char *buffer, + if (down_interruptible(&ipt_mutex) != 0) + return 0; + +- LIST_FIND(&ipt_target, print_target, struct ipt_target *, ++ LIST_FIND(&ve_ipt_target, print_target, struct ipt_target *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); +@@ -1883,7 +2167,7 @@ static int ipt_get_matches(char *buffer, + if (down_interruptible(&ipt_mutex) != 0) + return 0; + +- 
LIST_FIND(&ipt_match, print_name, void *, ++ LIST_FIND(&ve_ipt_match, print_name, void *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); +@@ -1899,6 +2183,36 @@ static const struct { char *name; get_in + { NULL, NULL} }; + #endif /*CONFIG_PROC_FS*/ + ++static int init_proc_entries(void) ++{ ++#ifdef CONFIG_PROC_FS ++ struct proc_dir_entry *proc; ++ int i; ++ ++ for (i = 0; ipt_proc_entry[i].name; i++) { ++ proc = proc_net_create(ipt_proc_entry[i].name, 0, ++ ipt_proc_entry[i].get_info); ++ if (!proc) { ++ while (--i >= 0) ++ proc_net_remove(ipt_proc_entry[i].name); ++ return -ENOMEM; ++ } ++ proc->owner = THIS_MODULE; ++ } ++#endif ++ return 0; ++} ++ ++static void fini_proc_entries(void) ++{ ++#ifdef CONFIG_PROC_FS ++ int i; ++ for (i = 0; ipt_proc_entry[i].name; i++) ++ proc_net_remove(ipt_proc_entry[i].name); ++#endif ++} ++ ++void fini_iptables(void); + static int __init init(void) + { + int ret; +@@ -1919,49 +2233,169 @@ static int __init init(void) + return ret; + } + +-#ifdef CONFIG_PROC_FS +- { +- struct proc_dir_entry *proc; +- int i; +- +- for (i = 0; ipt_proc_entry[i].name; i++) { +- proc = proc_net_create(ipt_proc_entry[i].name, 0, +- ipt_proc_entry[i].get_info); +- if (!proc) { +- while (--i >= 0) +- proc_net_remove(ipt_proc_entry[i].name); +- nf_unregister_sockopt(&ipt_sockopts); +- return -ENOMEM; +- } +- proc->owner = THIS_MODULE; ++ ret = init_proc_entries(); ++ if (ret) { ++ nf_unregister_sockopt(&ipt_sockopts); ++ return ret; + } ++ ++ printk("ip_tables: (C) 2000-2002 Netfilter core team\n"); ++ ++#if defined(CONFIG_VE_IPTABLES) ++ /* init ve0 */ ++ ret = init_iptables(); ++ if (ret) { ++ fini_proc_entries(); ++ nf_unregister_sockopt(&ipt_sockopts); ++ return ret; + } ++ ++ KSYMRESOLVE(init_iptables); ++ KSYMRESOLVE(fini_iptables); ++ KSYMRESOLVE(ipt_flush_table); ++ KSYMMODRESOLVE(ip_tables); + #endif ++ return ret; ++} ++ ++#ifdef CONFIG_VE_IPTABLES ++/* alloc helper */ ++#define ALLOC_ENVF(field,label) \ ++ if ( !(envid->field = kmalloc(sizeof(*(envid->field)), GFP_KERNEL)) ) \ ++ goto label; ++int init_iptables(void) ++{ ++ struct ve_struct *envid; ++ ++ envid = get_exec_env(); ++ ++ if (ve_is_super(envid)) { ++ envid->_ipt_target = &ipt_target; ++ envid->_ipt_match = &ipt_match; ++ envid->_ipt_tables = &ipt_tables; ++ ++ envid->_ipt_standard_target = &ipt_standard_target; ++ envid->_ipt_error_target = &ipt_error_target; ++ envid->_tcp_matchstruct = &tcp_matchstruct; ++ envid->_udp_matchstruct = &udp_matchstruct; ++ envid->_icmp_matchstruct = &icmp_matchstruct; ++ } else { ++ /* allocate structures in ve_struct */ ++ ALLOC_ENVF(_ipt_target,nomem0); ++ ALLOC_ENVF(_ipt_match,nomem1); ++ ALLOC_ENVF(_ipt_tables,nomem2); ++ ALLOC_ENVF(_ipt_standard_target,nomem3); ++ ALLOC_ENVF(_ipt_error_target,nomem4); ++ ALLOC_ENVF(_tcp_matchstruct,nomem5); ++ ALLOC_ENVF(_udp_matchstruct,nomem6); ++ ALLOC_ENVF(_icmp_matchstruct,nomem7); ++ ++ /* FIXME: charge ubc */ ++ INIT_LIST_HEAD(envid->_ipt_target); ++ INIT_LIST_HEAD(envid->_ipt_match); ++ INIT_LIST_HEAD(envid->_ipt_tables); ++ ++ memcpy(envid->_ipt_standard_target, &ipt_standard_target, ++ sizeof(ipt_standard_target)); ++ memcpy(envid->_ipt_error_target, &ipt_error_target, ++ sizeof(ipt_error_target)); ++ memcpy(envid->_tcp_matchstruct, &tcp_matchstruct, ++ sizeof(tcp_matchstruct)); ++ memcpy(envid->_udp_matchstruct, &udp_matchstruct, ++ sizeof(udp_matchstruct)); ++ memcpy(envid->_icmp_matchstruct, &icmp_matchstruct, ++ sizeof(icmp_matchstruct)); ++ ++ down(&ipt_mutex); ++ list_append(envid->_ipt_target, 
envid->_ipt_standard_target); ++ list_append(envid->_ipt_target, envid->_ipt_error_target); ++ list_append(envid->_ipt_match, envid->_tcp_matchstruct); ++ list_append(envid->_ipt_match, envid->_udp_matchstruct); ++ list_append(envid->_ipt_match, envid->_icmp_matchstruct); ++ up(&ipt_mutex); ++ ++ if (init_proc_entries()) ++ goto nomem8; ++ } + +- printk("ip_tables: (C) 2000-2002 Netfilter core team\n"); + return 0; ++ ++nomem8: ++ kfree(envid->_icmp_matchstruct); envid->_icmp_matchstruct = NULL; ++nomem7: ++ kfree(envid->_udp_matchstruct); envid->_udp_matchstruct = NULL; ++nomem6: ++ kfree(envid->_tcp_matchstruct); envid->_tcp_matchstruct = NULL; ++nomem5: ++ kfree(envid->_ipt_error_target); envid->_ipt_error_target = NULL; ++nomem4: ++ kfree(envid->_ipt_standard_target); envid->_ipt_standard_target = NULL; ++nomem3: ++ kfree(envid->_ipt_tables); envid->_ipt_tables = NULL; ++nomem2: ++ kfree(envid->_ipt_match); envid->_ipt_match = NULL; ++nomem1: ++ kfree(envid->_ipt_target); envid->_ipt_target = NULL; ++nomem0: ++ return -ENOMEM; ++} ++ ++void fini_iptables(void) ++{ ++ /* some cleanup */ ++ struct ve_struct *envid = get_exec_env(); ++ ++ if (envid->_ipt_tables != NULL && !ve_is_super(envid)) { ++ kfree(envid->_ipt_tables); ++ kfree(envid->_ipt_target); ++ kfree(envid->_ipt_match); ++ kfree(envid->_ipt_standard_target); ++ kfree(envid->_ipt_error_target); ++ kfree(envid->_tcp_matchstruct); ++ kfree(envid->_udp_matchstruct); ++ kfree(envid->_icmp_matchstruct); ++ fini_proc_entries(); ++ } ++ ++ envid->_ipt_tables = NULL; ++ envid->_ipt_target = NULL; ++ envid->_ipt_match = NULL; ++ envid->_ipt_standard_target = NULL; ++ envid->_ipt_error_target = NULL; ++ envid->_tcp_matchstruct = NULL; ++ envid->_udp_matchstruct = NULL; ++ envid->_icmp_matchstruct = NULL; + } ++#endif + + static void __exit fini(void) + { ++ KSYMMODUNRESOLVE(ip_tables); ++ KSYMUNRESOLVE(init_iptables); ++ KSYMUNRESOLVE(fini_iptables); ++ KSYMUNRESOLVE(ipt_flush_table); + nf_unregister_sockopt(&ipt_sockopts); +-#ifdef CONFIG_PROC_FS +- { +- int i; +- for (i = 0; ipt_proc_entry[i].name; i++) +- proc_net_remove(ipt_proc_entry[i].name); +- } ++ fini_proc_entries(); ++#ifdef CONFIG_VE_IPTABLES ++ fini_iptables(); + #endif + } + ++EXPORT_SYMBOL(ipt_flush_table); + EXPORT_SYMBOL(ipt_register_table); + EXPORT_SYMBOL(ipt_unregister_table); ++EXPORT_SYMBOL(virt_ipt_register_table); ++EXPORT_SYMBOL(virt_ipt_unregister_table); + EXPORT_SYMBOL(ipt_register_match); + EXPORT_SYMBOL(ipt_unregister_match); + EXPORT_SYMBOL(ipt_do_table); ++EXPORT_SYMBOL(virt_ipt_register_match); ++EXPORT_SYMBOL(virt_ipt_unregister_match); + EXPORT_SYMBOL(ipt_register_target); + EXPORT_SYMBOL(ipt_unregister_target); ++EXPORT_SYMBOL(virt_ipt_register_target); ++EXPORT_SYMBOL(virt_ipt_unregister_target); + EXPORT_SYMBOL(ipt_find_target); + +-module_init(init); ++subsys_initcall(init); + module_exit(fini); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_CLASSIFY.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_CLASSIFY.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_CLASSIFY.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_CLASSIFY.c 2006-01-27 14:48:08.000000000 +0300 +@@ -46,7 +46,8 @@ checkentry(const char *tablename, + unsigned int hook_mask) + { + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){ +- printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n", ++ ve_printk(VE_LOG, KERN_ERR ++ "CLASSIFY: invalid size (%u != %Zu).\n", + targinfosize, + IPT_ALIGN(sizeof(struct 
ipt_classify_target_info))); + return 0; +@@ -54,13 +55,14 @@ checkentry(const char *tablename, + + if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) | + (1 << NF_IP_POST_ROUTING))) { +- printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD " ++ ve_printk(VE_LOG, KERN_ERR ++ "CLASSIFY: only valid in LOCAL_OUT, FORWARD " + "and POST_ROUTING.\n"); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { +- printk(KERN_ERR "CLASSIFY: can only be called from " ++ ve_printk(VE_LOG, KERN_ERR "CLASSIFY: can only be called from " + "\"mangle\" table, not \"%s\".\n", + tablename); + return 0; +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_LOG.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_LOG.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_LOG.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_LOG.c 2006-01-27 14:48:08.000000000 +0300 +@@ -18,6 +18,7 @@ + #include <net/udp.h> + #include <net/tcp.h> + #include <net/route.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -51,32 +52,32 @@ static void dump_packet(const struct nf_ + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { +- printk("TRUNCATED"); ++ ve_printk(VE_LOG, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ +- printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", ++ ve_printk(VE_LOG, "SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", + NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ +- printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", ++ ve_printk(VE_LOG, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) +- printk("CE "); ++ ve_printk(VE_LOG, "CE "); + if (ntohs(ih->frag_off) & IP_DF) +- printk("DF "); ++ ve_printk(VE_LOG, "DF "); + if (ntohs(ih->frag_off) & IP_MF) +- printk("MF "); ++ ve_printk(VE_LOG, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) +- printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); ++ ve_printk(VE_LOG, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & IPT_LOG_IPOPT) + && ih->ihl * 4 > sizeof(struct iphdr)) { +@@ -87,15 +88,15 @@ static void dump_packet(const struct nf_ + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { +- printk("TRUNCATED"); ++ ve_printk(VE_LOG, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ +- printk("OPT ("); ++ ve_printk(VE_LOG, "OPT ("); + for (i = 0; i < optsize; i++) +- printk("%02X", op[i]); +- printk(") "); ++ ve_printk(VE_LOG, "%02X", op[i]); ++ ve_printk(VE_LOG, ") "); + } + + switch (ih->protocol) { +@@ -103,7 +104,7 @@ static void dump_packet(const struct nf_ + struct tcphdr _tcph, *th; + + /* Max length: 10 "PROTO=TCP " */ +- printk("PROTO=TCP "); ++ ve_printk(VE_LOG, "PROTO=TCP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; +@@ -112,41 +113,41 @@ static void dump_packet(const struct nf_ + th = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 
"SPT=65535 DPT=65535 " */ +- printk("SPT=%u DPT=%u ", ++ ve_printk(VE_LOG, "SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & IPT_LOG_TCPSEQ) +- printk("SEQ=%u ACK=%u ", ++ ve_printk(VE_LOG, "SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ +- printk("WINDOW=%u ", ntohs(th->window)); ++ ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3F " */ +- printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); ++ ve_printk(VE_LOG, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) +- printk("CWR "); ++ ve_printk(VE_LOG, "CWR "); + if (th->ece) +- printk("ECE "); ++ ve_printk(VE_LOG, "ECE "); + if (th->urg) +- printk("URG "); ++ ve_printk(VE_LOG, "URG "); + if (th->ack) +- printk("ACK "); ++ ve_printk(VE_LOG, "ACK "); + if (th->psh) +- printk("PSH "); ++ ve_printk(VE_LOG, "PSH "); + if (th->rst) +- printk("RST "); ++ ve_printk(VE_LOG, "RST "); + if (th->syn) +- printk("SYN "); ++ ve_printk(VE_LOG, "SYN "); + if (th->fin) +- printk("FIN "); ++ ve_printk(VE_LOG, "FIN "); + /* Max length: 11 "URGP=65535 " */ +- printk("URGP=%u ", ntohs(th->urg_ptr)); ++ ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & IPT_LOG_TCPOPT) + && th->doff * 4 > sizeof(struct tcphdr)) { +@@ -159,15 +160,15 @@ static void dump_packet(const struct nf_ + iphoff+ih->ihl*4+sizeof(_tcph), + optsize, _opt); + if (op == NULL) { +- printk("TRUNCATED"); ++ ve_printk(VE_LOG, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ +- printk("OPT ("); ++ ve_printk(VE_LOG, "OPT ("); + for (i = 0; i < optsize; i++) +- printk("%02X", op[i]); +- printk(") "); ++ ve_printk(VE_LOG, "%02X", op[i]); ++ ve_printk(VE_LOG, ") "); + } + break; + } +@@ -175,7 +176,7 @@ static void dump_packet(const struct nf_ + struct udphdr _udph, *uh; + + /* Max length: 10 "PROTO=UDP " */ +- printk("PROTO=UDP "); ++ ve_printk(VE_LOG, "PROTO=UDP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; +@@ -184,13 +185,13 @@ static void dump_packet(const struct nf_ + uh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ +- printk("SPT=%u DPT=%u LEN=%u ", ++ ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + break; +@@ -216,7 +217,7 @@ static void dump_packet(const struct nf_ + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ +- printk("PROTO=ICMP "); ++ ve_printk(VE_LOG, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; +@@ -225,19 +226,19 @@ static void dump_packet(const struct nf_ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ +- printk("TYPE=%u CODE=%u ", ich->type, ich->code); ++ ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES + && required_len[ich->type] + && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { +- printk("INCOMPLETE 
[%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } +@@ -246,19 +247,19 @@ static void dump_packet(const struct nf_ + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ +- printk("ID=%u SEQ=%u ", ++ ve_printk(VE_LOG, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ +- printk("PARAMETER=%u ", ++ ve_printk(VE_LOG, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ +- printk("GATEWAY=%u.%u.%u.%u ", ++ ve_printk(VE_LOG, "GATEWAY=%u.%u.%u.%u ", + NIPQUAD(ich->un.gateway)); + /* Fall through */ + case ICMP_DEST_UNREACH: +@@ -266,16 +267,16 @@ static void dump_packet(const struct nf_ + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ +- printk("["); ++ ve_printk(VE_LOG, "["); + dump_packet(info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); +- printk("] "); ++ ve_printk(VE_LOG, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH + && ich->code == ICMP_FRAG_NEEDED) +- printk("MTU=%u ", ntohs(ich->un.frag.mtu)); ++ ve_printk(VE_LOG, "MTU=%u ", ntohs(ich->un.frag.mtu)); + } + break; + } +@@ -287,26 +288,26 @@ static void dump_packet(const struct nf_ + break; + + /* Max length: 9 "PROTO=AH " */ +- printk("PROTO=AH "); ++ ve_printk(VE_LOG, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ +- printk("SPI=0x%x ", ntohl(ah->spi)); ++ ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph, *eh; + + /* Max length: 10 "PROTO=ESP " */ +- printk("PROTO=ESP "); ++ ve_printk(VE_LOG, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; +@@ -315,25 +316,25 @@ static void dump_packet(const struct nf_ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ +- printk("SPI=0x%x ", ntohl(eh->spi)); ++ ve_printk(VE_LOG, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: +- printk("PROTO=%u ", ih->protocol); ++ ve_printk(VE_LOG, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) +- printk("UID=%u ", skb->sk->sk_socket->file->f_uid); ++ ve_printk(VE_LOG, "UID=%u ", skb->sk->sk_socket->file->f_uid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } + +@@ -374,7 +375,7 @@ ipt_log_packet(unsigned int pf, + loginfo = &default_loginfo; + + spin_lock_bh(&log_lock); +- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, ++ ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, + in ? in->name : "", + out ? 
out->name : ""); +@@ -384,29 +385,29 @@ ipt_log_packet(unsigned int pf, + struct net_device *physoutdev = skb->nf_bridge->physoutdev; + + if (physindev && in != physindev) +- printk("PHYSIN=%s ", physindev->name); ++ ve_printk(VE_LOG, "PHYSIN=%s ", physindev->name); + if (physoutdev && out != physoutdev) +- printk("PHYSOUT=%s ", physoutdev->name); ++ ve_printk(VE_LOG, "PHYSOUT=%s ", physoutdev->name); + } + #endif + + if (in && !out) { + /* MAC logging for input chain only. */ +- printk("MAC="); ++ ve_printk(VE_LOG, "MAC="); + if (skb->dev && skb->dev->hard_header_len + && skb->mac.raw != (void*)skb->nh.iph) { + int i; + unsigned char *p = skb->mac.raw; + for (i = 0; i < skb->dev->hard_header_len; i++,p++) +- printk("%02x%c", *p, ++ ve_printk(VE_LOG, "%02x%c", *p, + i==skb->dev->hard_header_len - 1 + ? ' ':':'); + } else +- printk(" "); ++ ve_printk(VE_LOG, " "); + } + + dump_packet(loginfo, skb, 0); +- printk("\n"); ++ ve_printk(VE_LOG, "\n"); + spin_unlock_bh(&log_lock); + } + +@@ -471,24 +472,44 @@ static struct nf_logger ipt_log_logger = + .me = THIS_MODULE, + }; + ++int init_iptable_LOG(void) ++{ ++ return virt_ipt_register_target(&ipt_log_reg); ++} ++ ++void fini_iptable_LOG(void) ++{ ++ virt_ipt_unregister_target(&ipt_log_reg); ++} ++ + static int __init init(void) + { +- if (ipt_register_target(&ipt_log_reg)) +- return -EINVAL; ++ int err; ++ ++ err = init_iptable_LOG(); ++ if (err < 0) ++ return err; + if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { +- printk(KERN_WARNING "ipt_LOG: not logging via system console " ++ ve_printk(VE_LOG, KERN_WARNING "ipt_LOG: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * iptables userspace would abort */ + } + ++ ++ KSYMRESOLVE(init_iptable_LOG); ++ KSYMRESOLVE(fini_iptable_LOG); ++ KSYMMODRESOLVE(ipt_LOG); + return 0; + } + + static void __exit fini(void) + { ++ KSYMMODUNRESOLVE(ipt_LOG); ++ KSYMUNRESOLVE(init_iptable_LOG); ++ KSYMUNRESOLVE(fini_iptable_LOG); + nf_log_unregister_logger(&ipt_log_logger); +- ipt_unregister_target(&ipt_log_reg); ++ fini_iptable_LOG(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_MARK.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_MARK.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_MARK.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_MARK.c 2006-01-27 14:48:08.000000000 +0300 +@@ -77,14 +77,15 @@ checkentry_v0(const char *tablename, + struct ipt_mark_target_info *markinfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { +- printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", ++ ve_printk(VE_LOG, KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_mark_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { +- printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); ++ ve_printk(VE_LOG, KERN_WARNING "MARK: can only be called from " ++ "\"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_MASQUERADE.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_MASQUERADE.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_MASQUERADE.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_MASQUERADE.c 2006-01-27 14:48:08.000000000 +0300 +@@ -118,6 +118,7 @@ masquerade_target(struct sk_buff **pskb, + return 
ip_nat_setup_info(ct, &newrange, hooknum); + } + ++#if 0 + static inline int + device_cmp(struct ip_conntrack *i, void *ifindex) + { +@@ -173,6 +174,7 @@ static struct notifier_block masq_dev_no + static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, + }; ++#endif + + static struct ipt_target masquerade = { + .name = "MASQUERADE", +@@ -187,12 +189,16 @@ static int __init init(void) + + ret = ipt_register_target(&masquerade); + ++#if 0 ++/* These notifiers are unnecessary and may ++ lead to oops in virtual environments */ + if (ret == 0) { + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); + } ++#endif + + return ret; + } +@@ -200,8 +206,8 @@ static int __init init(void) + static void __exit fini(void) + { + ipt_unregister_target(&masquerade); +- unregister_netdevice_notifier(&masq_dev_notifier); +- unregister_inetaddr_notifier(&masq_inet_notifier); ++/* unregister_netdevice_notifier(&masq_dev_notifier); ++ unregister_inetaddr_notifier(&masq_inet_notifier); */ + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_REJECT.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_REJECT.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_REJECT.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_REJECT.c 2006-01-27 14:48:08.000000000 +0300 +@@ -22,6 +22,7 @@ + #include <net/ip.h> + #include <net/tcp.h> + #include <net/route.h> ++#include <linux/nfcalls.h> + #include <net/dst.h> + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_REJECT.h> +@@ -307,7 +308,7 @@ static int check(const char *tablename, + } + + if (rejinfo->with == IPT_ICMP_ECHOREPLY) { +- printk("REJECT: ECHOREPLY no longer supported.\n"); ++ ve_printk(VE_LOG, "REJECT: ECHOREPLY no longer supported.\n"); + return 0; + } else if (rejinfo->with == IPT_TCP_RESET) { + /* Must specify that it's a TCP packet */ +@@ -328,14 +329,36 @@ static struct ipt_target ipt_reject_reg + .me = THIS_MODULE, + }; + ++int init_iptable_REJECT(void) ++{ ++ return virt_ipt_register_target(&ipt_reject_reg); ++} ++ ++void fini_iptable_REJECT(void) ++{ ++ virt_ipt_unregister_target(&ipt_reject_reg); ++} ++ + static int __init init(void) + { +- return ipt_register_target(&ipt_reject_reg); ++ int err; ++ ++ err = init_iptable_REJECT(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_REJECT); ++ KSYMRESOLVE(fini_iptable_REJECT); ++ KSYMMODRESOLVE(ipt_REJECT); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_target(&ipt_reject_reg); ++ KSYMMODUNRESOLVE(ipt_REJECT); ++ KSYMUNRESOLVE(init_iptable_REJECT); ++ KSYMUNRESOLVE(fini_iptable_REJECT); ++ fini_iptable_REJECT(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_TCPMSS.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_TCPMSS.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_TCPMSS.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_TCPMSS.c 2006-01-27 14:48:08.000000000 +0300 +@@ -13,6 +13,7 @@ + + #include <linux/ip.h> + #include <net/tcp.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_TCPMSS.h> +@@ -228,7 +229,8 @@ ipt_tcpmss_checkentry(const char *tablen + ((hook_mask & ~((1 << NF_IP_FORWARD) + | (1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))) != 0)) { +- printk("TCPMSS: 
path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); ++ ve_printk(VE_LOG, "TCPMSS: path-MTU clamping only supported in " ++ "FORWARD, OUTPUT and POSTROUTING hooks\n"); + return 0; + } + +@@ -237,7 +239,7 @@ ipt_tcpmss_checkentry(const char *tablen + && IPT_MATCH_ITERATE(e, find_syn_match)) + return 1; + +- printk("TCPMSS: Only works on TCP SYN packets\n"); ++ ve_printk(VE_LOG, "TCPMSS: Only works on TCP SYN packets\n"); + return 0; + } + +@@ -248,14 +250,36 @@ static struct ipt_target ipt_tcpmss_reg + .me = THIS_MODULE, + }; + ++int init_iptable_TCPMSS(void) ++{ ++ return virt_ipt_register_target(&ipt_tcpmss_reg); ++} ++ ++void fini_iptable_TCPMSS(void) ++{ ++ virt_ipt_unregister_target(&ipt_tcpmss_reg); ++} ++ + static int __init init(void) + { +- return ipt_register_target(&ipt_tcpmss_reg); ++ int err; ++ ++ err = init_iptable_TCPMSS(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_TCPMSS); ++ KSYMRESOLVE(fini_iptable_TCPMSS); ++ KSYMMODRESOLVE(ipt_TCPMSS); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_target(&ipt_tcpmss_reg); ++ KSYMMODUNRESOLVE(ipt_TCPMSS); ++ KSYMUNRESOLVE(init_iptable_TCPMSS); ++ KSYMUNRESOLVE(fini_iptable_TCPMSS); ++ fini_iptable_TCPMSS(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_TOS.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_TOS.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_TOS.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_TOS.c 2006-01-27 14:48:08.000000000 +0300 +@@ -15,6 +15,7 @@ + + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_TOS.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +@@ -60,14 +61,15 @@ checkentry(const char *tablename, + const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) { +- printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n", ++ ve_printk(VE_LOG, KERN_WARNING "TOS: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_tos_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { +- printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename); ++ ve_printk(VE_LOG, KERN_WARNING "TOS: can only be called from " ++ "\"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + +@@ -76,7 +78,7 @@ checkentry(const char *tablename, + && tos != IPTOS_RELIABILITY + && tos != IPTOS_MINCOST + && tos != IPTOS_NORMALSVC) { +- printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); ++ ve_printk(VE_LOG, KERN_WARNING "TOS: bad tos value %#x\n", tos); + return 0; + } + +@@ -90,14 +92,36 @@ static struct ipt_target ipt_tos_reg = { + .me = THIS_MODULE, + }; + ++int init_iptable_TOS(void) ++{ ++ return virt_ipt_register_target(&ipt_tos_reg); ++} ++ ++void fini_iptable_TOS(void) ++{ ++ virt_ipt_unregister_target(&ipt_tos_reg); ++} ++ + static int __init init(void) + { +- return ipt_register_target(&ipt_tos_reg); ++ int err; ++ ++ err = init_iptable_TOS(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_TOS); ++ KSYMRESOLVE(fini_iptable_TOS); ++ KSYMMODRESOLVE(ipt_TOS); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_target(&ipt_tos_reg); ++ KSYMMODUNRESOLVE(ipt_TOS); ++ KSYMUNRESOLVE(init_iptable_TOS); ++ KSYMUNRESOLVE(fini_iptable_TOS); ++ fini_iptable_TOS(); + } + + module_init(init); +diff -uprN 
linux-2.6.15.orig/net/ipv4/netfilter/ipt_conntrack.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_conntrack.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_conntrack.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_conntrack.c 2006-01-27 14:48:08.000000000 +0300 +@@ -20,6 +20,7 @@ + + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_conntrack.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +@@ -217,15 +218,37 @@ static struct ipt_match conntrack_match + .me = THIS_MODULE, + }; + ++int init_iptable_conntrack_match(void) ++{ ++ return virt_ipt_register_match(&conntrack_match); ++} ++ ++void fini_iptable_conntrack_match(void) ++{ ++ virt_ipt_unregister_match(&conntrack_match); ++} ++ + static int __init init(void) + { ++ int err; ++ + need_ip_conntrack(); +- return ipt_register_match(&conntrack_match); ++ err = init_iptable_conntrack_match(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_conntrack_match); ++ KSYMRESOLVE(fini_iptable_conntrack_match); ++ KSYMMODRESOLVE(ipt_conntrack); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_match(&conntrack_match); ++ KSYMMODUNRESOLVE(ipt_conntrack); ++ KSYMUNRESOLVE(init_iptable_conntrack_match); ++ KSYMUNRESOLVE(fini_iptable_conntrack_match); ++ fini_iptable_conntrack_match(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_helper.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_helper.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_helper.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_helper.c 2006-01-27 14:48:08.000000000 +0300 +@@ -24,6 +24,7 @@ + #endif + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_helper.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); +@@ -151,15 +152,37 @@ static struct ipt_match helper_match = { + .me = THIS_MODULE, + }; + ++int init_iptable_helper(void) ++{ ++ return virt_ipt_register_match(&helper_match); ++} ++ ++void fini_iptable_helper(void) ++{ ++ virt_ipt_unregister_match(&helper_match); ++} ++ + static int __init init(void) + { ++ int err; ++ + need_ip_conntrack(); +- return ipt_register_match(&helper_match); ++ err = init_iptable_helper(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_helper); ++ KSYMRESOLVE(fini_iptable_helper); ++ KSYMMODRESOLVE(ipt_helper); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_match(&helper_match); ++ KSYMMODUNRESOLVE(ipt_helper); ++ KSYMUNRESOLVE(init_iptable_helper); ++ KSYMUNRESOLVE(fini_iptable_helper); ++ fini_iptable_helper(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_length.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_length.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_length.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_length.c 2006-01-27 14:48:08.000000000 +0300 +@@ -8,6 +8,7 @@ + + #include <linux/module.h> + #include <linux/skbuff.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ipt_length.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -50,14 +51,36 @@ static struct ipt_match length_match = { + .me = THIS_MODULE, + }; + ++int init_iptable_length(void) ++{ ++ return virt_ipt_register_match(&length_match); ++} ++ ++void fini_iptable_length(void) ++{ ++ 
virt_ipt_unregister_match(&length_match); ++} ++ + static int __init init(void) + { +- return ipt_register_match(&length_match); ++ int err; ++ ++ err = init_iptable_length(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_length); ++ KSYMRESOLVE(fini_iptable_length); ++ KSYMMODRESOLVE(ipt_length); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_match(&length_match); ++ KSYMMODUNRESOLVE(ipt_length); ++ KSYMUNRESOLVE(init_iptable_length); ++ KSYMUNRESOLVE(fini_iptable_length); ++ fini_iptable_length(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_limit.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_limit.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_limit.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_limit.c 2006-01-27 14:48:08.000000000 +0300 +@@ -17,6 +17,7 @@ + #include <linux/skbuff.h> + #include <linux/spinlock.h> + #include <linux/interrupt.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_limit.h> +@@ -25,6 +26,13 @@ MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); + MODULE_DESCRIPTION("iptables rate limit match"); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ipt_limit_reg (*(get_exec_env()->_ipt_limit_reg)) ++#else ++#define ve_ipt_limit_reg ipt_limit_reg ++#endif ++ + /* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ +@@ -116,7 +124,7 @@ ipt_limit_checkentry(const char *tablena + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { +- printk("Overflow in ipt_limit, try lower: %u/%u\n", ++ ve_printk(VE_LOG, "Overflow in ipt_limit, try lower: %u/%u\n", + r->avg, r->burst); + return 0; + } +@@ -141,16 +149,36 @@ static struct ipt_match ipt_limit_reg = + .me = THIS_MODULE, + }; + ++int init_iptable_limit(void) ++{ ++ return virt_ipt_register_match(&ipt_limit_reg); ++} ++ ++void fini_iptable_limit(void) ++{ ++ virt_ipt_unregister_match(&ipt_limit_reg); ++} ++ + static int __init init(void) + { +- if (ipt_register_match(&ipt_limit_reg)) +- return -EINVAL; ++ int err; ++ ++ err = init_iptable_limit(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_limit); ++ KSYMRESOLVE(fini_iptable_limit); ++ KSYMMODRESOLVE(ipt_limit); + return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_match(&ipt_limit_reg); ++ KSYMMODUNRESOLVE(ipt_limit); ++ KSYMUNRESOLVE(init_iptable_limit); ++ KSYMUNRESOLVE(fini_iptable_limit); ++ fini_iptable_limit(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_mac.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_mac.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_mac.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_mac.c 2006-01-27 14:48:08.000000000 +0300 +@@ -48,7 +48,8 @@ ipt_mac_checkentry(const char *tablename + if (hook_mask + & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) + | (1 << NF_IP_FORWARD))) { +- printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); ++ ve_printk(VE_LOG, "ipt_mac: only valid for PRE_ROUTING, " ++ "LOCAL_IN or FORWARD.\n"); + return 0; + } + +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_multiport.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_multiport.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_multiport.c 2006-01-03 06:21:10.000000000 
+0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_multiport.c 2006-01-27 14:48:08.000000000 +0300 +@@ -13,6 +13,7 @@ + #include <linux/types.h> + #include <linux/udp.h> + #include <linux/skbuff.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ipt_multiport.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -21,6 +22,13 @@ MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); + MODULE_DESCRIPTION("iptables multiple port match module"); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_multiport_match (*(get_exec_env()->_multiport_match)) ++#else ++#define ve_multiport_match multiport_match ++#endif ++ + #if 0 + #define duprintf(format, args...) printk(format , ## args) + #else +@@ -188,24 +196,45 @@ static struct ipt_match multiport_match_ + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_iptable_multiport(void) + { + int err; + +- err = ipt_register_match(&multiport_match); ++ err = virt_ipt_register_match(&multiport_match); + if (!err) { +- err = ipt_register_match(&multiport_match_v1); ++ err = virt_ipt_register_match(&multiport_match_v1); + if (err) +- ipt_unregister_match(&multiport_match); ++ virt_ipt_unregister_match(&multiport_match); + } +- + return err; + } + ++void fini_iptable_multiport(void) ++{ ++ virt_ipt_unregister_match(&multiport_match); ++ virt_ipt_unregister_match(&multiport_match_v1); ++} ++ ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_multiport(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_multiport); ++ KSYMRESOLVE(fini_iptable_multiport); ++ KSYMMODRESOLVE(ipt_multiport); ++ return 0; ++} ++ + static void __exit fini(void) + { +- ipt_unregister_match(&multiport_match); +- ipt_unregister_match(&multiport_match_v1); ++ KSYMMODUNRESOLVE(ipt_multiport); ++ KSYMUNRESOLVE(init_iptable_multiport); ++ KSYMUNRESOLVE(fini_iptable_multiport); ++ fini_iptable_multiport(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_state.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_state.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_state.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_state.c 2006-01-27 14:48:08.000000000 +0300 +@@ -10,6 +10,7 @@ + + #include <linux/module.h> + #include <linux/skbuff.h> ++#include <linux/nfcalls.h> + #include <net/netfilter/nf_conntrack_compat.h> + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_state.h> +@@ -59,15 +60,37 @@ static struct ipt_match state_match = { + .me = THIS_MODULE, + }; + ++int init_iptable_state(void) ++{ ++ return virt_ipt_register_match(&state_match); ++} ++ ++void fini_iptable_state(void) ++{ ++ virt_ipt_unregister_match(&state_match); ++} ++ + static int __init init(void) + { ++ int err; ++ + need_ip_conntrack(); +- return ipt_register_match(&state_match); ++ err = init_iptable_state(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_state); ++ KSYMRESOLVE(fini_iptable_state); ++ KSYMMODRESOLVE(ipt_state); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_match(&state_match); ++ KSYMMODUNRESOLVE(ipt_state); ++ KSYMUNRESOLVE(init_iptable_state); ++ KSYMUNRESOLVE(fini_iptable_state); ++ fini_iptable_state(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_tcpmss.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_tcpmss.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_tcpmss.c 2006-01-03 06:21:10.000000000 
+0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_tcpmss.c 2006-01-27 14:48:08.000000000 +0300 +@@ -10,6 +10,7 @@ + #include <linux/module.h> + #include <linux/skbuff.h> + #include <net/tcp.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ipt_tcpmss.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -99,7 +100,7 @@ checkentry(const char *tablename, + + /* Must specify -p tcp */ + if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) { +- printk("tcpmss: Only works on TCP packets\n"); ++ ve_printk(VE_LOG, "tcpmss: Only works on TCP packets\n"); + return 0; + } + +@@ -113,14 +114,36 @@ static struct ipt_match tcpmss_match = { + .me = THIS_MODULE, + }; + ++int init_iptable_tcpmss(void) ++{ ++ return virt_ipt_register_match(&tcpmss_match); ++} ++ ++void fini_iptable_tcpmss(void) ++{ ++ virt_ipt_unregister_match(&tcpmss_match); ++} ++ + static int __init init(void) + { +- return ipt_register_match(&tcpmss_match); ++ int err; ++ ++ err = init_iptable_tcpmss(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_tcpmss); ++ KSYMRESOLVE(fini_iptable_tcpmss); ++ KSYMMODRESOLVE(ipt_tcpmss); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_match(&tcpmss_match); ++ KSYMMODUNRESOLVE(ipt_tcpmss); ++ KSYMUNRESOLVE(init_iptable_tcpmss); ++ KSYMUNRESOLVE(fini_iptable_tcpmss); ++ fini_iptable_tcpmss(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_tos.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_tos.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_tos.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_tos.c 2006-01-27 14:48:08.000000000 +0300 +@@ -10,6 +10,7 @@ + + #include <linux/module.h> + #include <linux/skbuff.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ipt_tos.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -17,6 +18,13 @@ + MODULE_LICENSE("GPL"); + MODULE_DESCRIPTION("iptables TOS match module"); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_tos_match (*(get_exec_env()->_tos_match)) ++#else ++#define ve_tos_match tos_match ++#endif ++ + static int + match(const struct sk_buff *skb, + const struct net_device *in, +@@ -50,14 +58,36 @@ static struct ipt_match tos_match = { + .me = THIS_MODULE, + }; + ++int init_iptable_tos(void) ++{ ++ return virt_ipt_register_match(&tos_match); ++} ++ ++void fini_iptable_tos(void) ++{ ++ virt_ipt_unregister_match(&tos_match); ++} ++ + static int __init init(void) + { +- return ipt_register_match(&tos_match); ++ int err; ++ ++ err = init_iptable_tos(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_tos); ++ KSYMRESOLVE(fini_iptable_tos); ++ KSYMMODRESOLVE(ipt_tos); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_match(&tos_match); ++ KSYMMODUNRESOLVE(ipt_tos); ++ KSYMUNRESOLVE(init_iptable_tos); ++ KSYMUNRESOLVE(fini_iptable_tos); ++ fini_iptable_tos(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/ipt_ttl.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_ttl.c +--- linux-2.6.15.orig/net/ipv4/netfilter/ipt_ttl.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/ipt_ttl.c 2006-01-27 14:48:08.000000000 +0300 +@@ -11,6 +11,7 @@ + + #include <linux/module.h> + #include <linux/skbuff.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ipt_ttl.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -64,15 +65,36 @@ static struct ipt_match ttl_match = { + 
.me = THIS_MODULE, + }; + ++int init_iptable_ttl(void) ++{ ++ return virt_ipt_register_match(&ttl_match); ++} ++ ++void fini_iptable_ttl(void) ++{ ++ virt_ipt_unregister_match(&ttl_match); ++} ++ + static int __init init(void) + { +- return ipt_register_match(&ttl_match); ++ int err; ++ ++ err = init_iptable_ttl(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_ttl); ++ KSYMRESOLVE(fini_iptable_ttl); ++ KSYMMODRESOLVE(ipt_ttl); ++ return 0; + } + + static void __exit fini(void) + { +- ipt_unregister_match(&ttl_match); +- ++ KSYMMODUNRESOLVE(ipt_ttl); ++ KSYMUNRESOLVE(init_iptable_ttl); ++ KSYMUNRESOLVE(fini_iptable_ttl); ++ fini_iptable_ttl(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/iptable_filter.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/iptable_filter.c +--- linux-2.6.15.orig/net/ipv4/netfilter/iptable_filter.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/iptable_filter.c 2006-01-27 14:48:08.000000000 +0300 +@@ -12,12 +12,20 @@ + + #include <linux/module.h> + #include <linux/moduleparam.h> ++#include <linux/nfcalls.h> + #include <linux/netfilter_ipv4/ip_tables.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); + MODULE_DESCRIPTION("iptables filter table"); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_packet_filter (get_exec_env()->_ve_ipt_filter_pf) ++#else ++#define ve_packet_filter &packet_filter ++#endif ++ + #define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) + + static struct +@@ -25,7 +33,7 @@ static struct + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +-} initial_table __initdata ++} initial_table + = { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_LOCAL_IN] = 0, +@@ -89,7 +97,7 @@ ipt_hook(unsigned int hook, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) + { +- return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); ++ return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL); + } + + static unsigned int +@@ -107,7 +115,7 @@ ipt_local_out_hook(unsigned int hook, + return NF_ACCEPT; + } + +- return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); ++ return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL); + } + + static struct nf_hook_ops ipt_ops[] = { +@@ -138,56 +146,89 @@ static struct nf_hook_ops ipt_ops[] = { + static int forward = NF_ACCEPT; + module_param(forward, bool, 0000); + +-static int __init init(void) ++int init_iptable_filter(void) + { + int ret; +- +- if (forward < 0 || forward > NF_MAX_VERDICT) { +- printk("iptables forward must be 0 or 1\n"); +- return -EINVAL; +- } +- +- /* Entry 1 is the FORWARD hook */ +- initial_table.entries[1].target.verdict = -forward - 1; ++ struct ipt_table *tmp_filter; + + /* Register table */ +- ret = ipt_register_table(&packet_filter, &initial_table.repl); +- if (ret < 0) +- return ret; ++ tmp_filter = virt_ipt_register_table(&packet_filter, ++ &initial_table.repl); ++ if (IS_ERR(tmp_filter)) ++ return PTR_ERR(tmp_filter); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_filter = tmp_filter; ++#endif + + /* Register hooks */ +- ret = nf_register_hook(&ipt_ops[0]); ++ ret = virt_nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + +- ret = nf_register_hook(&ipt_ops[1]); ++ ret = virt_nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + +- ret = 
nf_register_hook(&ipt_ops[2]); ++ ret = virt_nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: +- nf_unregister_hook(&ipt_ops[1]); ++ virt_nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: +- nf_unregister_hook(&ipt_ops[0]); ++ virt_nf_unregister_hook(&ipt_ops[0]); + cleanup_table: +- ipt_unregister_table(&packet_filter); ++ virt_ipt_unregister_table(ve_packet_filter); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_filter = NULL; ++#endif + + return ret; + } + +-static void __exit fini(void) ++void fini_iptable_filter(void) + { + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) +- nf_unregister_hook(&ipt_ops[i]); ++ virt_nf_unregister_hook(&ipt_ops[i]); + +- ipt_unregister_table(&packet_filter); ++ virt_ipt_unregister_table(ve_packet_filter); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_filter = NULL; ++#endif ++} ++ ++static int __init init(void) ++{ ++ int err; ++ ++ if (forward < 0 || forward > NF_MAX_VERDICT) { ++ printk("iptables forward must be 0 or 1\n"); ++ return -EINVAL; ++ } ++ ++ /* Entry 1 is the FORWARD hook */ ++ initial_table.entries[1].target.verdict = -forward - 1; ++ ++ err = init_iptable_filter(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_filter); ++ KSYMRESOLVE(fini_iptable_filter); ++ KSYMMODRESOLVE(iptable_filter); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(iptable_filter); ++ KSYMUNRESOLVE(init_iptable_filter); ++ KSYMUNRESOLVE(fini_iptable_filter); ++ fini_iptable_filter(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/netfilter/iptable_mangle.c linux-2.6.15-ve025stab014/net/ipv4/netfilter/iptable_mangle.c +--- linux-2.6.15.orig/net/ipv4/netfilter/iptable_mangle.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/netfilter/iptable_mangle.c 2006-01-27 14:48:08.000000000 +0300 +@@ -17,6 +17,7 @@ + #include <linux/skbuff.h> + #include <net/sock.h> + #include <net/route.h> ++#include <linux/nfcalls.h> + #include <linux/ip.h> + + MODULE_LICENSE("GPL"); +@@ -35,7 +36,7 @@ static struct + struct ipt_replace repl; + struct ipt_standard entries[5]; + struct ipt_error term; +-} initial_table __initdata ++} initial_table + = { { "mangle", MANGLE_VALID_HOOKS, 6, + sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] = 0, +@@ -111,6 +112,13 @@ static struct ipt_table packet_mangler = + .me = THIS_MODULE, + }; + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_packet_mangler (get_exec_env()->_ipt_mangle_table) ++#else ++#define ve_packet_mangler &packet_mangler ++#endif ++ + /* The work comes in here from netfilter.c. */ + static unsigned int + ipt_route_hook(unsigned int hook, +@@ -119,7 +127,7 @@ ipt_route_hook(unsigned int hook, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) + { +- return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); ++ return ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); + } + + static unsigned int +@@ -148,7 +156,8 @@ ipt_local_hook(unsigned int hook, + daddr = (*pskb)->nh.iph->daddr; + tos = (*pskb)->nh.iph->tos; + +- ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); ++ ret = ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); ++ + /* Reroute for ANY change. 
*/ + if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE + && ((*pskb)->nh.iph->saddr != saddr +@@ -200,60 +209,103 @@ static struct nf_hook_ops ipt_ops[] = { + }, + }; + +-static int __init init(void) ++static int mangle_init(struct nf_hook_ops ipt_ops[]) + { + int ret; ++ struct ipt_table *tmp_mangler; + + /* Register table */ +- ret = ipt_register_table(&packet_mangler, &initial_table.repl); +- if (ret < 0) +- return ret; ++ tmp_mangler = virt_ipt_register_table(&packet_mangler, ++ &initial_table.repl); ++ if (IS_ERR(tmp_mangler)) ++ return PTR_ERR(tmp_mangler); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_mangler = tmp_mangler; ++#endif + + /* Register hooks */ +- ret = nf_register_hook(&ipt_ops[0]); ++ ret = virt_nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + +- ret = nf_register_hook(&ipt_ops[1]); ++ ret = virt_nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + +- ret = nf_register_hook(&ipt_ops[2]); ++ ret = virt_nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + +- ret = nf_register_hook(&ipt_ops[3]); ++ ret = virt_nf_register_hook(&ipt_ops[3]); + if (ret < 0) + goto cleanup_hook2; + +- ret = nf_register_hook(&ipt_ops[4]); ++ ret = virt_nf_register_hook(&ipt_ops[4]); + if (ret < 0) + goto cleanup_hook3; + + return ret; + + cleanup_hook3: +- nf_unregister_hook(&ipt_ops[3]); ++ virt_nf_unregister_hook(&ipt_ops[3]); + cleanup_hook2: +- nf_unregister_hook(&ipt_ops[2]); ++ virt_nf_unregister_hook(&ipt_ops[2]); + cleanup_hook1: +- nf_unregister_hook(&ipt_ops[1]); ++ virt_nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: +- nf_unregister_hook(&ipt_ops[0]); ++ virt_nf_unregister_hook(&ipt_ops[0]); + cleanup_table: +- ipt_unregister_table(&packet_mangler); ++ virt_ipt_unregister_table(ve_packet_mangler); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_mangler = NULL; ++#endif + + return ret; + } + +-static void __exit fini(void) ++static void mangle_fini(struct nf_hook_ops ipt_ops[]) + { + unsigned int i; + +- for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) +- nf_unregister_hook(&ipt_ops[i]); ++ for (i = 0; i < 5; i++) ++ virt_nf_unregister_hook(&ipt_ops[i]); ++ ++ virt_ipt_unregister_table(ve_packet_mangler); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_mangler = NULL; ++#endif ++} ++ ++static int init_iptable_mangle(void) ++{ ++ return mangle_init(ipt_ops); ++} ++ ++static void fini_iptable_mangle(void) ++{ ++ mangle_fini(ipt_ops); ++} ++ ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_mangle(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_mangle); ++ KSYMRESOLVE(fini_iptable_mangle); ++ KSYMMODRESOLVE(iptable_mangle); ++ return 0; ++} + +- ipt_unregister_table(&packet_mangler); ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(iptable_mangle); ++ KSYMUNRESOLVE(init_iptable_mangle); ++ KSYMUNRESOLVE(fini_iptable_mangle); ++ fini_iptable_mangle(); + } + + module_init(init); +diff -uprN linux-2.6.15.orig/net/ipv4/proc.c linux-2.6.15-ve025stab014/net/ipv4/proc.c +--- linux-2.6.15.orig/net/ipv4/proc.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/proc.c 2006-01-27 14:48:08.000000000 +0300 +@@ -257,11 +257,12 @@ static int snmp_seq_show(struct seq_file + seq_printf(seq, " %s", snmp4_ipstats_list[i].name); + + seq_printf(seq, "\nIp: %d %d", +- ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl); ++ ve_ipv4_devconf.forwarding ? 
1 : 2, ++ sysctl_ip_default_ttl); + + for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- fold_field((void **) ip_statistics, ++ fold_field((void **) ve_ip_statistics, + snmp4_ipstats_list[i].entry)); + + seq_puts(seq, "\nIcmp:"); +@@ -271,7 +272,7 @@ static int snmp_seq_show(struct seq_file + seq_puts(seq, "\nIcmp:"); + for (i = 0; snmp4_icmp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- fold_field((void **) icmp_statistics, ++ fold_field((void **) ve_icmp_statistics, + snmp4_icmp_list[i].entry)); + + seq_puts(seq, "\nTcp:"); +@@ -283,11 +284,11 @@ static int snmp_seq_show(struct seq_file + /* MaxConn field is signed, RFC 2012 */ + if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) + seq_printf(seq, " %ld", +- fold_field((void **) tcp_statistics, ++ fold_field((void **) ve_tcp_statistics, + snmp4_tcp_list[i].entry)); + else + seq_printf(seq, " %lu", +- fold_field((void **) tcp_statistics, ++ fold_field((void **) ve_tcp_statistics, + snmp4_tcp_list[i].entry)); + } + +@@ -298,7 +299,7 @@ static int snmp_seq_show(struct seq_file + seq_puts(seq, "\nUdp:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- fold_field((void **) udp_statistics, ++ fold_field((void **) ve_udp_statistics, + snmp4_udp_list[i].entry)); + + seq_putc(seq, '\n'); +@@ -332,7 +333,7 @@ static int netstat_seq_show(struct seq_f + seq_puts(seq, "\nTcpExt:"); + for (i = 0; snmp4_net_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- fold_field((void **) net_statistics, ++ fold_field((void **) ve_net_statistics, + snmp4_net_list[i].entry)); + + seq_putc(seq, '\n'); +@@ -356,10 +357,10 @@ int __init ip_misc_proc_init(void) + { + int rc = 0; + +- if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops)) ++ if (!proc_glob_fops_create("net/netstat", S_IRUGO, &netstat_seq_fops)) + goto out_netstat; + +- if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops)) ++ if (!proc_glob_fops_create("net/snmp", S_IRUGO, &snmp_seq_fops)) + goto out_snmp; + + if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops)) +@@ -367,9 +368,9 @@ int __init ip_misc_proc_init(void) + out: + return rc; + out_sockstat: +- proc_net_remove("snmp"); ++ remove_proc_glob_entry("net/snmp", NULL); + out_snmp: +- proc_net_remove("netstat"); ++ remove_proc_glob_entry("net/netstat", NULL); + out_netstat: + rc = -ENOMEM; + goto out; +diff -uprN linux-2.6.15.orig/net/ipv4/raw.c linux-2.6.15-ve025stab014/net/ipv4/raw.c +--- linux-2.6.15.orig/net/ipv4/raw.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/raw.c 2006-01-27 14:48:08.000000000 +0300 +@@ -114,7 +114,8 @@ struct sock *__raw_v4_lookup(struct sock + if (inet->num == num && + !(inet->daddr && inet->daddr != raddr) && + !(inet->rcv_saddr && inet->rcv_saddr != laddr) && +- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) ++ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) && ++ ve_accessible_strict(VE_OWNER_SK(sk), get_exec_env())) + goto found; /* gotcha */ + } + sk = NULL; +@@ -752,8 +753,12 @@ static struct sock *raw_get_first(struct + struct hlist_node *node; + + sk_for_each(sk, node, &raw_v4_htable[state->bucket]) +- if (sk->sk_family == PF_INET) ++ if (sk->sk_family == PF_INET) { ++ if (!ve_accessible(VE_OWNER_SK(sk), ++ get_exec_env())) ++ continue; + goto found; ++ } + } + sk = NULL; + found: +@@ -767,8 +772,14 @@ static struct sock *raw_get_next(struct + do { + sk = sk_next(sk); + try_again: +- ; +- } while (sk && sk->sk_family != PF_INET); ++ if (!sk) ++ break; ++ if 
(sk->sk_family != PF_INET) ++ continue; ++ if (ve_accessible(VE_OWNER_SK(sk), ++ get_exec_env())) ++ break; ++ } while (1); + + if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { + sk = sk_head(&raw_v4_htable[state->bucket]); +@@ -885,13 +896,13 @@ static struct file_operations raw_seq_fo + + int __init raw_proc_init(void) + { +- if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops)) ++ if (!proc_glob_fops_create("net/raw", S_IRUGO, &raw_seq_fops)) + return -ENOMEM; + return 0; + } + + void __init raw_proc_exit(void) + { +- proc_net_remove("raw"); ++ remove_proc_glob_entry("net/raw", NULL); + } + #endif /* CONFIG_PROC_FS */ +diff -uprN linux-2.6.15.orig/net/ipv4/route.c linux-2.6.15-ve025stab014/net/ipv4/route.c +--- linux-2.6.15.orig/net/ipv4/route.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/route.c 2006-01-27 14:48:08.000000000 +0300 +@@ -114,6 +114,8 @@ + + #define RT_GC_TIMEOUT (300*HZ) + ++int ip_rt_src_check = 1; ++ + static int ip_rt_min_delay = 2 * HZ; + static int ip_rt_max_delay = 10 * HZ; + static int ip_rt_max_size; +@@ -253,11 +255,28 @@ static unsigned int rt_hash_code(u32 dad + & rt_hash_mask); + } + ++void prepare_rt_cache(void) ++{ ++#ifdef CONFIG_VE ++ struct rtable *r; ++ int i; ++ ++ for (i = rt_hash_mask; i >= 0; i--) { ++ spin_lock_bh(rt_hash_lock_addr(i)); ++ for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) { ++ r->fl.owner_env = get_ve0(); ++ } ++ spin_unlock_bh(rt_hash_lock_addr(i)); ++ } ++#endif ++} ++ + #ifdef CONFIG_PROC_FS + struct rt_cache_iter_state { + int bucket; + }; + ++static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r); + static struct rtable *rt_cache_get_first(struct seq_file *seq) + { + struct rtable *r = NULL; +@@ -270,6 +289,8 @@ static struct rtable *rt_cache_get_first + break; + rcu_read_unlock_bh(); + } ++ if (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())) ++ r = rt_cache_get_next(seq, r); + return r; + } + +@@ -277,14 +298,19 @@ static struct rtable *rt_cache_get_next( + { + struct rt_cache_iter_state *st = rcu_dereference(seq->private); + +- r = r->u.rt_next; ++start: ++ do { ++ r = r->u.rt_next; ++ } while (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())); + while (!r) { + rcu_read_unlock_bh(); + if (--st->bucket < 0) +- break; ++ goto out; + rcu_read_lock_bh(); + r = rt_hash_table[st->bucket].chain; + } ++ goto start; ++out: + return r; + } + +@@ -556,7 +582,8 @@ static inline int compare_keys(struct fl + { + return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && + fl1->oif == fl2->oif && +- fl1->iif == fl2->iif; ++ fl1->iif == fl2->iif && ++ ve_accessible_strict(fl1->owner_env, fl2->owner_env); + } + + #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED +@@ -670,26 +697,105 @@ static void rt_check_expire(unsigned lon + mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); + } + ++typedef unsigned long rt_flush_gen_t; ++ ++#ifdef CONFIG_VE ++ ++static rt_flush_gen_t rt_flush_gen; ++ ++/* called under rt_flush_lock */ ++static void set_rt_flush_required(struct ve_struct *env) ++{ ++ /* ++ * If the global generation rt_flush_gen is equal to G, then ++ * the pass considering entries labelled by G is yet to come. 
++ */ ++ env->rt_flush_required = rt_flush_gen; ++} ++ ++static spinlock_t rt_flush_lock; ++static rt_flush_gen_t reset_rt_flush_required(void) ++{ ++ rt_flush_gen_t g; ++ ++ spin_lock_bh(&rt_flush_lock); ++ g = rt_flush_gen++; ++ spin_unlock_bh(&rt_flush_lock); ++ return g; ++} ++ ++static int check_rt_flush_required(struct ve_struct *env, rt_flush_gen_t gen) ++{ ++ /* can be checked without the lock */ ++ return env->rt_flush_required >= gen; ++} ++ ++#else ++ ++static void set_rt_flush_required(struct ve_struct *env) ++{ ++} ++ ++static rt_flush_gen_t reset_rt_flush_required(void) ++{ ++ return 0; ++} ++ ++#endif ++ + /* This can run from both BH and non-BH contexts, the latter + * in the case of a forced flush event. + */ + static void rt_run_flush(unsigned long dummy) + { + int i; +- struct rtable *rth, *next; ++ struct rtable * rth, * next; ++ struct rtable * tail; ++ rt_flush_gen_t gen; + + rt_deadline = 0; + + get_random_bytes(&rt_hash_rnd, 4); + ++ gen = reset_rt_flush_required(); ++ + for (i = rt_hash_mask; i >= 0; i--) { ++#ifdef CONFIG_VE ++ struct rtable ** prev, * p; ++ ++ spin_lock_bh(rt_hash_lock_addr(i)); ++ rth = rt_hash_table[i].chain; ++ ++ /* defer releasing the head of the list after spin_unlock */ ++ for (tail = rth; tail; tail = tail->u.rt_next) ++ if (!check_rt_flush_required(tail->fl.owner_env, gen)) ++ break; ++ if (rth != tail) ++ rt_hash_table[i].chain = tail; ++ ++ /* call rt_free on entries after the tail requiring flush */ ++ prev = &rt_hash_table[i].chain; ++ for (p = *prev; p; p = next) { ++ next = p->u.rt_next; ++ if (!check_rt_flush_required(p->fl.owner_env, gen)) { ++ prev = &p->u.rt_next; ++ } else { ++ *prev = next; ++ rt_free(p); ++ } ++ } ++ ++#else + spin_lock_bh(rt_hash_lock_addr(i)); + rth = rt_hash_table[i].chain; + if (rth) + rt_hash_table[i].chain = NULL; ++ tail = NULL; ++ ++#endif + spin_unlock_bh(rt_hash_lock_addr(i)); + +- for (; rth; rth = next) { ++ for (; rth != tail; rth = next) { + next = rth->u.rt_next; + rt_free(rth); + } +@@ -728,6 +834,8 @@ void rt_cache_flush(int delay) + delay = tmo; + } + ++ set_rt_flush_required(get_exec_env()); ++ + if (delay <= 0) { + spin_unlock_bh(&rt_flush_lock); + rt_run_flush(0); +@@ -743,9 +851,30 @@ void rt_cache_flush(int delay) + + static void rt_secret_rebuild(unsigned long dummy) + { ++ int i; ++ struct rtable *rth, *next; + unsigned long now = jiffies; + +- rt_cache_flush(0); ++ spin_lock_bh(&rt_flush_lock); ++ del_timer(&rt_flush_timer); ++ spin_unlock_bh(&rt_flush_lock); ++ ++ rt_deadline = 0; ++ get_random_bytes(&rt_hash_rnd, 4); ++ ++ for (i = rt_hash_mask; i >= 0; i--) { ++ spin_lock_bh(rt_hash_lock_addr(i)); ++ rth = rt_hash_table[i].chain; ++ if (rth) ++ rt_hash_table[i].chain = NULL; ++ spin_unlock_bh(rt_hash_lock_addr(i)); ++ ++ for (; rth; rth = next) { ++ next = rth->u.rt_next; ++ rt_free(rth); ++ } ++ } ++ + mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); + } + +@@ -1118,7 +1247,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd + struct rtable *rth, **rthp; + u32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + tos &= IPTOS_RT_MASK; + + if (!in_dev) +@@ -1154,6 +1285,10 @@ void ip_rt_redirect(u32 old_gw, u32 dadd + rth->fl.fl4_src != skeys[i] || + rth->fl.fl4_tos != tos || + rth->fl.oif != ikeys[k] || ++#ifdef CONFIG_VE ++ !ve_accessible_strict(rth->fl.owner_env, ++ ve) || ++#endif + rth->fl.iif != 0) { + rthp = &rth->u.rt_next; + continue; +@@ -1192,6 +1327,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd + 
rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + rt->u.dst.xfrm = NULL; ++#ifdef CONFIG_VE ++ rt->fl.owner_env = ve; ++#endif + + rt->rt_flags |= RTCF_REDIRECTED; + +@@ -1631,6 +1769,9 @@ static int ip_route_input_mc(struct sk_b + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; + #ifdef CONFIG_NET_CLS_ROUTE +@@ -1638,7 +1779,7 @@ static int ip_route_input_mc(struct sk_b + #endif + rth->rt_iif = + rth->fl.iif = dev->ifindex; +- rth->u.dst.dev = &loopback_dev; ++ rth->u.dst.dev = &visible_loopback_dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->fl.oif = 0; +@@ -1776,6 +1917,9 @@ static inline int __mkroute_input(struct + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; + rth->rt_gateway = daddr; +@@ -1959,7 +2103,7 @@ static int ip_route_input_slow(struct sk + if (res.type == RTN_LOCAL) { + int result; + result = fib_validate_source(saddr, daddr, tos, +- loopback_dev.ifindex, ++ visible_loopback_dev.ifindex, + dev, &spec_dst, &itag); + if (result < 0) + goto martian_source; +@@ -2021,6 +2165,9 @@ local_input: + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; + #ifdef CONFIG_NET_CLS_ROUTE +@@ -2028,7 +2175,7 @@ local_input: + #endif + rth->rt_iif = + rth->fl.iif = dev->ifindex; +- rth->u.dst.dev = &loopback_dev; ++ rth->u.dst.dev = &visible_loopback_dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->rt_gateway = daddr; +@@ -2100,6 +2247,9 @@ int ip_route_input(struct sk_buff *skb, + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark == skb->nfmark && + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env == get_exec_env() && ++#endif + rth->fl.fl4_tos == tos) { + rth->u.dst.lastuse = jiffies; + dst_hold(&rth->u.dst); +@@ -2226,6 +2376,9 @@ static inline int __mkroute_output(struc + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= oldflp->fl4_fwmark; + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->rt_dst = fl->fl4_dst; + rth->rt_src = fl->fl4_src; + rth->rt_iif = oldflp->oif ? : dev_out->ifindex; +@@ -2378,7 +2531,7 @@ static int ip_route_output_slow(struct r + .fwmark = oldflp->fl4_fwmark + #endif + } }, +- .iif = loopback_dev.ifindex, ++ .iif = visible_loopback_dev.ifindex, + .oif = oldflp->oif }; + struct fib_result res; + unsigned flags = 0; +@@ -2399,10 +2552,13 @@ static int ip_route_output_slow(struct r + ZERONET(oldflp->fl4_src)) + goto out; + +- /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ +- dev_out = ip_dev_find(oldflp->fl4_src); +- if (dev_out == NULL) +- goto out; ++ if (ip_rt_src_check) { ++ /* It is equivalent to ++ inet_addr_type(saddr) == RTN_LOCAL */ ++ dev_out = ip_dev_find(oldflp->fl4_src); ++ if (dev_out == NULL) ++ goto out; ++ } + + /* I removed check for oif == dev_out->oif here. + It was wrong for two reasons: +@@ -2429,6 +2585,12 @@ static int ip_route_output_slow(struct r + Luckily, this hack is good workaround. 
+ */ + ++ if (dev_out == NULL) { ++ dev_out = ip_dev_find(oldflp->fl4_src); ++ if (dev_out == NULL) ++ goto out; ++ } ++ + fl.oif = dev_out->ifindex; + goto make_route; + } +@@ -2472,9 +2634,9 @@ static int ip_route_output_slow(struct r + fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); + if (dev_out) + dev_put(dev_out); +- dev_out = &loopback_dev; ++ dev_out = &visible_loopback_dev; + dev_hold(dev_out); +- fl.oif = loopback_dev.ifindex; ++ fl.oif = visible_loopback_dev.ifindex; + res.type = RTN_LOCAL; + flags |= RTCF_LOCAL; + goto make_route; +@@ -2519,7 +2681,7 @@ static int ip_route_output_slow(struct r + fl.fl4_src = fl.fl4_dst; + if (dev_out) + dev_put(dev_out); +- dev_out = &loopback_dev; ++ dev_out = &visible_loopback_dev; + dev_hold(dev_out); + fl.oif = dev_out->ifindex; + if (res.fi) +@@ -2575,6 +2737,7 @@ int __ip_route_output_key(struct rtable + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark == flp->fl4_fwmark && + #endif ++ ve_accessible_strict(rth->fl.owner_env, get_exec_env()) && + !((rth->fl.fl4_tos ^ flp->fl4_tos) & + (IPTOS_RT_MASK | RTO_ONLINK))) { + +@@ -2705,7 +2868,7 @@ static int rt_fill_info(struct sk_buff * + u32 dst = rt->rt_dst; + + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && +- ipv4_devconf.mc_forwarding) { ++ ve_ipv4_devconf.mc_forwarding) { + int err = ipmr_get_route(skb, r, nowait); + if (err <= 0) { + if (!nowait) { +@@ -2853,22 +3016,22 @@ void ip_rt_multicast_event(struct in_dev + } + + #ifdef CONFIG_SYSCTL +-static int flush_delay; ++int ipv4_flush_delay; + +-static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, ++int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) + { + if (write) { + proc_dointvec(ctl, write, filp, buffer, lenp, ppos); +- rt_cache_flush(flush_delay); ++ rt_cache_flush(ipv4_flush_delay); + return 0; + } + + return -EINVAL; + } + +-static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, ++int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, + int __user *name, + int nlen, + void __user *oldval, +@@ -2890,7 +3053,7 @@ ctl_table ipv4_route_table[] = { + { + .ctl_name = NET_IPV4_ROUTE_FLUSH, + .procname = "flush", +- .data = &flush_delay, ++ .data = &ipv4_flush_delay, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &ipv4_sysctl_rtcache_flush, +@@ -3188,16 +3351,17 @@ int __init ip_rt_init(void) + #ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ +- if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || +- !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, +- proc_net_stat))) { ++ if (!proc_glob_fops_create("net/rt_cache", ++ S_IRUGO, &rt_cache_seq_fops) || ++ !(rtstat_pde = create_proc_glob_entry("net/stat/rt_cache", ++ S_IRUGO, NULL))) { + free_percpu(rt_cache_stat); + return -ENOMEM; + } + rtstat_pde->proc_fops = &rt_cpu_seq_fops; + } + #ifdef CONFIG_NET_CLS_ROUTE +- create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); ++ create_proc_read_entry("net/rt_acct", 0, NULL, ip_rt_acct_read, NULL); + #endif + #endif + #ifdef CONFIG_XFRM +diff -uprN linux-2.6.15.orig/net/ipv4/sysctl_net_ipv4.c linux-2.6.15-ve025stab014/net/ipv4/sysctl_net_ipv4.c +--- linux-2.6.15.orig/net/ipv4/sysctl_net_ipv4.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/sysctl_net_ipv4.c 2006-01-27 14:48:08.000000000 +0300 +@@ -31,22 +31,21 @@ struct ipv4_config ipv4_config; + + #ifdef CONFIG_SYSCTL + +-static + int ipv4_sysctl_forward(ctl_table *ctl, int write, 
struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) + { +- int val = ipv4_devconf.forwarding; ++ int val = ve_ipv4_devconf.forwarding; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + +- if (write && ipv4_devconf.forwarding != val) ++ if (write && ve_ipv4_devconf.forwarding != val) + inet_forward_change(); + + return ret; + } + +-static int ipv4_sysctl_forward_strategy(ctl_table *table, ++int ipv4_sysctl_forward_strategy(ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, +diff -uprN linux-2.6.15.orig/net/ipv4/tcp.c linux-2.6.15-ve025stab014/net/ipv4/tcp.c +--- linux-2.6.15.orig/net/ipv4/tcp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/tcp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -248,6 +248,7 @@ + */ + + #include <linux/config.h> ++#include <linux/kmem_cache.h> + #include <linux/module.h> + #include <linux/types.h> + #include <linux/fcntl.h> +@@ -263,6 +264,9 @@ + #include <net/xfrm.h> + #include <net/ip.h> + ++#include <ub/ub_orphan.h> ++#include <ub/ub_net.h> ++#include <ub/ub_tcp.h> + + #include <asm/uaccess.h> + #include <asm/ioctls.h> +@@ -321,6 +325,7 @@ unsigned int tcp_poll(struct file *file, + unsigned int mask; + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); ++ int check_send_space; + + poll_wait(file, sk->sk_sleep, wait); + if (sk->sk_state == TCP_LISTEN) +@@ -335,6 +340,21 @@ unsigned int tcp_poll(struct file *file, + if (sk->sk_err) + mask = POLLERR; + ++ check_send_space = 1; ++#ifdef CONFIG_USER_RESOURCE ++ if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) { ++ unsigned long size; ++ size = MAX_TCP_HEADER + tp->mss_cache; ++ if (size > SOCK_MIN_UBCSPACE) ++ size = SOCK_MIN_UBCSPACE; ++ size = skb_charge_size(size); ++ if (ub_sock_makewres_tcp(sk, size)) { ++ check_send_space = 0; ++ ub_sock_sndqueueadd_tcp(sk, size); ++ } ++ } ++#endif ++ + /* + * POLLHUP is certainly not done right. 
But poll() doesn't + * have a notion of HUP in just one direction, and for a +@@ -378,7 +398,7 @@ unsigned int tcp_poll(struct file *file, + sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) + mask |= POLLIN | POLLRDNORM; + +- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { ++ if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ +@@ -528,16 +548,23 @@ static ssize_t do_tcp_sendpages(struct s + int copy, i, can_coalesce; + int offset = poffset % PAGE_SIZE; + int size = min_t(size_t, psize, PAGE_SIZE - offset); ++ unsigned long chargesize = 0; + + if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { + new_segment: ++ chargesize = 0; + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + ++ chargesize = skb_charge_size(MAX_TCP_HEADER + ++ tp->mss_cache); ++ if (ub_sock_getwres_tcp(sk, chargesize) < 0) ++ goto wait_for_ubspace; + skb = sk_stream_alloc_pskb(sk, 0, 0, + sk->sk_allocation); + if (!skb) + goto wait_for_memory; ++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + + skb_entail(sk, tp, skb); + copy = size_goal; +@@ -593,10 +620,14 @@ new_segment: + wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + wait_for_memory: ++ ub_sock_retwres_tcp(sk, chargesize, ++ skb_charge_size(MAX_TCP_HEADER + tp->mss_cache)); ++ chargesize = 0; ++wait_for_ubspace: + if (copied) + tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + +- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) ++ if ((err = sk_stream_wait_memory(sk, &timeo, chargesize)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); +@@ -699,6 +730,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru + while (--iovlen >= 0) { + int seglen = iov->iov_len; + unsigned char __user *from = iov->iov_base; ++ unsigned long chargesize = 0; + + iov++; + +@@ -709,18 +741,26 @@ int tcp_sendmsg(struct kiocb *iocb, stru + + if (!sk->sk_send_head || + (copy = size_goal - skb->len) <= 0) { ++ unsigned long size; + + new_segment: + /* Allocate new segment. If the interface is SG, + * allocate skb fitting to single page. + */ ++ chargesize = 0; + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; +- +- skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), +- 0, sk->sk_allocation); ++ size = select_size(sk, tp); ++ chargesize = skb_charge_size(MAX_TCP_HEADER + ++ size); ++ if (ub_sock_getwres_tcp(sk, chargesize) < 0) ++ goto wait_for_ubspace; ++ skb = sk_stream_alloc_pskb(sk, size, 0, ++ sk->sk_allocation); + if (!skb) + goto wait_for_memory; ++ ub_skb_set_charge(skb, sk, chargesize, ++ UB_TCPSNDBUF); + + /* + * Check whether we can use HW checksum. +@@ -768,6 +808,7 @@ new_segment: + } else if (page) { + if (off == PAGE_SIZE) { + put_page(page); ++ ub_sock_tcp_detachpage(sk); + TCP_PAGE(sk) = page = NULL; + off = 0; + } +@@ -781,6 +822,9 @@ new_segment: + goto wait_for_memory; + + if (!page) { ++ chargesize = PAGE_SIZE; ++ if (ub_sock_tcp_chargepage(sk) < 0) ++ goto wait_for_ubspace; + /* Allocate new cache page. 
*/ + if (!(page = sk_stream_alloc_page(sk))) + goto wait_for_memory; +@@ -812,7 +856,8 @@ new_segment: + } else if (off + copy < PAGE_SIZE) { + get_page(page); + TCP_PAGE(sk) = page; +- } ++ } else ++ ub_sock_tcp_detachpage(sk); + } + + TCP_OFF(sk) = off + copy; +@@ -843,10 +888,15 @@ new_segment: + wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + wait_for_memory: ++ ub_sock_retwres_tcp(sk, chargesize, ++ skb_charge_size(MAX_TCP_HEADER+tp->mss_cache)); ++ chargesize = 0; ++wait_for_ubspace: + if (copied) + tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + +- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) ++ if ((err = sk_stream_wait_memory(sk, &timeo, ++ chargesize)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); +@@ -944,7 +994,18 @@ static void cleanup_rbuf(struct sock *sk + #if TCP_DEBUG + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + +- BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); ++ if (!(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))) { ++ printk("KERNEL: assertion: skb==NULL || " ++ "before(tp->copied_seq, skb->end_seq)\n"); ++ printk("VE%u pid %d comm %.16s\n", ++ (get_exec_env() ? VEID(get_exec_env()) : 0), ++ current->pid, current->comm); ++ printk("copied=%d, copied_seq=%d, rcv_nxt=%d\n", copied, ++ tp->copied_seq, tp->rcv_nxt); ++ printk("skb->len=%d, skb->seq=%d, skb->end_seq=%d\n", ++ skb->len, TCP_SKB_CB(skb)->seq, ++ TCP_SKB_CB(skb)->end_seq); ++ } + #endif + + if (inet_csk_ack_scheduled(sk)) { +@@ -1168,7 +1229,22 @@ int tcp_recvmsg(struct kiocb *iocb, stru + goto found_ok_skb; + if (skb->h.th->fin) + goto found_fin_ok; +- BUG_TRAP(flags & MSG_PEEK); ++ if (!(flags & MSG_PEEK)) { ++ printk("KERNEL: assertion: flags&MSG_PEEK\n"); ++ printk("VE%u pid %d comm %.16s\n", ++ (get_exec_env() ? ++ VEID(get_exec_env()) : 0), ++ current->pid, current->comm); ++ printk("flags=0x%x, len=%d, copied_seq=%d, " ++ "rcv_nxt=%d\n", flags, len, ++ tp->copied_seq, tp->rcv_nxt); ++ printk("skb->len=%d, *seq=%d, skb->seq=%d, " ++ "skb->end_seq=%d, offset=%d\n", ++ skb->len, *seq, ++ TCP_SKB_CB(skb)->seq, ++ TCP_SKB_CB(skb)->end_seq, ++ offset); ++ } + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + +@@ -1231,8 +1307,18 @@ int tcp_recvmsg(struct kiocb *iocb, stru + + tp->ucopy.len = len; + +- BUG_TRAP(tp->copied_seq == tp->rcv_nxt || +- (flags & (MSG_PEEK | MSG_TRUNC))); ++ if (!(tp->copied_seq == tp->rcv_nxt || ++ (flags&(MSG_PEEK|MSG_TRUNC)))) { ++ printk("KERNEL: assertion: tp->copied_seq == " ++ "tp->rcv_nxt || ...\n"); ++ printk("VE%u pid %d comm %.16s\n", ++ (get_exec_env() ? ++ VEID(get_exec_env()) : 0), ++ current->pid, current->comm); ++ printk("flags=0x%x, len=%d, copied_seq=%d, " ++ "rcv_nxt=%d\n", flags, len, ++ tp->copied_seq, tp->rcv_nxt); ++ } + + /* Ugly... 
If prequeue is not empty, we have to + * process it before releasing socket, otherwise +@@ -1583,7 +1669,7 @@ adjudge_to_death: + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); + } else { +- atomic_inc(sk->sk_prot->orphan_count); ++ ub_inc_orphan_count(sk); + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } +@@ -1591,9 +1677,7 @@ adjudge_to_death: + } + if (sk->sk_state != TCP_CLOSE) { + sk_stream_mem_reclaim(sk); +- if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || +- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && +- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { ++ if (ub_too_many_orphans(sk, ub_get_orphan_count(sk))) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned " + "sockets\n"); +@@ -1602,7 +1686,7 @@ adjudge_to_death: + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + } + } +- atomic_inc(sk->sk_prot->orphan_count); ++ ub_inc_orphan_count(sk); + + if (sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(sk); +@@ -2051,7 +2135,7 @@ void __init tcp_init(void) + tcp_hashinfo.bind_bucket_cachep = + kmem_cache_create("tcp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, +- SLAB_HWCACHE_ALIGN, NULL, NULL); ++ SLAB_HWCACHE_ALIGN | SLAB_UBC, NULL, NULL); + if (!tcp_hashinfo.bind_bucket_cachep) + panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); + +diff -uprN linux-2.6.15.orig/net/ipv4/tcp_input.c linux-2.6.15-ve025stab014/net/ipv4/tcp_input.c +--- linux-2.6.15.orig/net/ipv4/tcp_input.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/tcp_input.c 2006-01-27 14:48:06.000000000 +0300 +@@ -72,6 +72,8 @@ + #include <linux/ipsec.h> + #include <asm/unaligned.h> + ++#include <ub/ub_tcp.h> ++ + int sysctl_tcp_timestamps = 1; + int sysctl_tcp_window_scaling = 1; + int sysctl_tcp_sack = 1; +@@ -252,7 +254,7 @@ static inline void tcp_grow_window(struc + /* Check #1 */ + if (tp->rcv_ssthresh < tp->window_clamp && + (int)tp->rcv_ssthresh < tcp_space(sk) && +- !tcp_memory_pressure) { ++ ub_tcp_rmem_allows_expand(sk)) { + int incr; + + /* Check #2. Increase window, if skb with such overhead +@@ -321,6 +323,8 @@ static void tcp_init_buffer_space(struct + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; ++ ++ ub_tcp_update_maxadvmss(sk); + } + + /* 5. Recalculate window clamp after socket hit its memory bounds. */ +@@ -332,7 +336,7 @@ static void tcp_clamp_window(struct sock + + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && +- !tcp_memory_pressure && ++ !ub_tcp_memory_pressure(sk) && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); +@@ -3077,7 +3081,7 @@ queue_and_out: + !sk_stream_rmem_schedule(sk, skb))) { + if (tcp_prune_queue(sk) < 0 || + !sk_stream_rmem_schedule(sk, skb)) +- goto drop; ++ goto drop_part; + } + sk_stream_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); +@@ -3121,6 +3125,12 @@ out_of_window: + drop: + __kfree_skb(skb); + return; ++ ++drop_part: ++ if (after(tp->copied_seq, tp->rcv_nxt)) ++ tp->rcv_nxt = tp->copied_seq; ++ __kfree_skb(skb); ++ return; + } + + /* Out of window. F.e. zero window probe. 
*/ +@@ -3292,6 +3302,10 @@ tcp_collapse(struct sock *sk, struct sk_ + nskb = alloc_skb(copy+header, GFP_ATOMIC); + if (!nskb) + return; ++ if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) { ++ kfree_skb(nskb); ++ return; ++ } + skb_reserve(nskb, header); + memcpy(nskb->head, skb->head, header); + nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); +@@ -3388,7 +3402,7 @@ static int tcp_prune_queue(struct sock * + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk, tp); +- else if (tcp_memory_pressure) ++ else if (ub_tcp_memory_pressure(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + + tcp_collapse_ofo_queue(sk); +@@ -3464,7 +3478,7 @@ static inline int tcp_should_expand_sndb + return 0; + + /* If we are under global TCP memory pressure, do not expand. */ +- if (tcp_memory_pressure) ++ if (ub_tcp_memory_pressure(sk)) + return 0; + + /* If we are under soft global TCP memory pressure, do not expand. */ +@@ -3858,6 +3872,10 @@ int tcp_rcv_established(struct sock *sk, + + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; ++ /* This is OK not to try to free memory here. ++ * Do this below on slow path. Den */ ++ if (ub_tcprcvbuf_charge(sk, skb) < 0) ++ goto step5; + + NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); + +diff -uprN linux-2.6.15.orig/net/ipv4/tcp_ipv4.c linux-2.6.15-ve025stab014/net/ipv4/tcp_ipv4.c +--- linux-2.6.15.orig/net/ipv4/tcp_ipv4.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/tcp_ipv4.c 2006-01-27 14:48:08.000000000 +0300 +@@ -71,6 +71,8 @@ + #include <net/inet_common.h> + #include <net/xfrm.h> + ++#include <ub/ub_tcp.h> ++ + #include <linux/inet.h> + #include <linux/ipv6.h> + #include <linux/stddef.h> +@@ -128,12 +130,16 @@ static int __tcp_v4_check_established(st + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); +- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); +- struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); ++ unsigned int hash; ++ struct inet_ehash_bucket *head; + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; ++ struct ve_struct *env; + ++ env = VE_OWNER_SK(sk); ++ hash = inet_ehashfn(daddr, lport, saddr, inet->dport, VEID(env)); ++ head = inet_ehash_bucket(&tcp_hashinfo, hash); + prefetch(head->chain.first); + write_lock(&head->lock); + +@@ -141,7 +147,8 @@ static int __tcp_v4_check_established(st + sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { + tw = inet_twsk(sk2); + +- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { ++ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ++ ports, dif, env)) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); + struct tcp_sock *tp = tcp_sk(sk); + +@@ -178,7 +185,8 @@ static int __tcp_v4_check_established(st + + /* And established part... 
*/ + sk_for_each(sk2, node, &head->chain) { +- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) ++ if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ++ ports, dif, env)) + goto not_unique; + } + +@@ -228,7 +236,9 @@ static inline int tcp_v4_hash_connect(st + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; ++ struct ve_struct *env; + ++ env = VE_OWNER_SK(sk); + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; +@@ -243,7 +253,9 @@ static inline int tcp_v4_hash_connect(st + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; +- head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; ++ head = &tcp_hashinfo.bhash[inet_bhashfn(port, ++ tcp_hashinfo.bhash_size, ++ VEID(env))]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, +@@ -251,6 +263,8 @@ static inline int tcp_v4_hash_connect(st + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { ++ if (!ve_accessible_strict(VE_OWNER_TB(tb),env)) ++ continue; + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) +@@ -263,7 +277,7 @@ static inline int tcp_v4_hash_connect(st + } + } + +- tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); ++ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port, env); + if (!tb) { + spin_unlock(&head->lock); + break; +@@ -298,7 +312,8 @@ ok: + goto out; + } + +- head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; ++ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, ++ tcp_hashinfo.bhash_size, VEID(env))]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { +@@ -1137,12 +1152,15 @@ static int tcp_v4_checksum_init(struct s + */ + int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) + { ++ struct user_beancounter *ub; ++ ++ ub = set_exec_ub(sock_bc(sk)->ub); + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + TCP_CHECK_TIMER(sk); + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); +- return 0; ++ goto restore_context; + } + + if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb)) +@@ -1156,7 +1174,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc + if (nsk != sk) { + if (tcp_child_process(sk, nsk, skb)) + goto reset; +- return 0; ++ goto restore_context; + } + } + +@@ -1164,6 +1182,9 @@ int tcp_v4_do_rcv(struct sock *sk, struc + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); ++ ++restore_context: ++ (void)set_exec_ub(ub); + return 0; + + reset: +@@ -1175,7 +1196,7 @@ discard: + * might be destroyed here. This current version compiles correctly, + * but you have been warned. + */ +- return 0; ++ goto restore_context; + + csum_err: + TCP_INC_STATS_BH(TCP_MIB_INERRS); +@@ -1468,6 +1489,8 @@ int tcp_v4_destroy_sock(struct sock *sk) + * If sendmsg cached page exists, toss it. 
+ */ + if (sk->sk_sndmsg_page) { ++ /* queue is empty, uncharge */ ++ ub_sock_tcp_detachpage(sk); + __free_page(sk->sk_sndmsg_page); + sk->sk_sndmsg_page = NULL; + } +@@ -1488,10 +1511,18 @@ static inline struct inet_timewait_sock + list_entry(head->first, struct inet_timewait_sock, tw_node); + } + +-static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) ++static inline struct inet_timewait_sock * ++ tw_next(struct inet_timewait_sock *tw, envid_t veid) + { +- return tw->tw_node.next ? +- hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; ++ while (1) { ++ if (tw->tw_node.next == NULL) ++ return NULL; ++ tw = hlist_entry(tw->tw_node.next, typeof(*tw), tw_node); ++ if (!ve_accessible_veid(tw->tw_owner_env, veid)) ++ continue; ++ return tw; ++ } ++ return NULL; /* make compiler happy */ + } + + static void *listening_get_next(struct seq_file *seq, void *cur) +@@ -1500,7 +1531,9 @@ static void *listening_get_next(struct s + struct hlist_node *node; + struct sock *sk = cur; + struct tcp_iter_state* st = seq->private; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + if (!sk) { + st->bucket = 0; + sk = sk_head(&tcp_hashinfo.listening_hash[0]); +@@ -1540,6 +1573,8 @@ get_req: + } + get_sk: + sk_for_each_from(sk, node) { ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (sk->sk_family == st->family) { + cur = sk; + goto out; +@@ -1580,7 +1615,9 @@ static void *established_get_first(struc + { + struct tcp_iter_state* st = seq->private; + void *rc = NULL; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { + struct sock *sk; + struct hlist_node *node; +@@ -1591,6 +1628,8 @@ static void *established_get_first(struc + + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (sk->sk_family != st->family) { + continue; + } +@@ -1600,6 +1639,8 @@ static void *established_get_first(struc + st->state = TCP_SEQ_STATE_TIME_WAIT; + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { ++ if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) ++ continue; + if (tw->tw_family != st->family) { + continue; + } +@@ -1619,16 +1660,17 @@ static void *established_get_next(struct + struct inet_timewait_sock *tw; + struct hlist_node *node; + struct tcp_iter_state* st = seq->private; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + ++st->num; + + if (st->state == TCP_SEQ_STATE_TIME_WAIT) { + tw = cur; +- tw = tw_next(tw); ++ tw = tw_next(tw, VEID(ve)); + get_tw: +- while (tw && tw->tw_family != st->family) { +- tw = tw_next(tw); +- } ++ while (tw && tw->tw_family != st->family) ++ tw = tw_next(tw, VEID(ve)); + if (tw) { + cur = tw; + goto out; +@@ -1650,6 +1692,8 @@ get_tw: + sk = sk_next(sk); + + sk_for_each_from(sk, node) { ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (sk->sk_family == st->family) + goto found; + } +@@ -1801,7 +1845,12 @@ int tcp_proc_register(struct tcp_seq_afi + afinfo->seq_fops->llseek = seq_lseek; + afinfo->seq_fops->release = seq_release_private; + +- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); ++ if (*(afinfo->name) == 'n') ++ p = proc_glob_fops_create(afinfo->name, S_IRUGO, ++ afinfo->seq_fops); ++ else ++ p = proc_net_fops_create(afinfo->name, S_IRUGO, ++ afinfo->seq_fops); + if (p) + p->data = afinfo; + else +@@ -1813,7 +1862,10 @@ void tcp_proc_unregister(struct 
tcp_seq_ + { + if (!afinfo) + return; +- proc_net_remove(afinfo->name); ++ if (*(afinfo->name) == 'n') ++ remove_proc_glob_entry(afinfo->name, NULL); ++ else ++ proc_net_remove(afinfo->name); + memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); + } + +@@ -1943,7 +1995,7 @@ out: + static struct file_operations tcp4_seq_fops; + static struct tcp_seq_afinfo tcp4_seq_afinfo = { + .owner = THIS_MODULE, +- .name = "tcp", ++ .name = "net/tcp", + .family = AF_INET, + .seq_show = tcp4_seq_show, + .seq_fops = &tcp4_seq_fops, +@@ -2010,8 +2062,87 @@ void __init tcp_v4_init(struct net_proto + tcp_socket->sk->sk_prot->unhash(tcp_socket->sk); + } + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++static void tcp_kill_ve_onesk(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ /* Check the assumed state of the socket. */ ++ if (!sock_flag(sk, SOCK_DEAD)) { ++ static int printed; ++invalid: ++ if (!printed) ++ printk(KERN_DEBUG "Killing sk: dead %d, state %d, " ++ "wrseq %u unseq %u, wrqu %d.\n", ++ sock_flag(sk, SOCK_DEAD), sk->sk_state, ++ tp->write_seq, tp->snd_una, ++ !skb_queue_empty(&sk->sk_write_queue)); ++ printed = 1; ++ return; ++ } ++ ++ tcp_send_active_reset(sk, GFP_ATOMIC); ++ switch (sk->sk_state) { ++ case TCP_FIN_WAIT1: ++ case TCP_CLOSING: ++ /* In these 2 states the peer may want us to retransmit ++ * some data and/or FIN. Entering "resetting mode" ++ * instead. ++ */ ++ tcp_time_wait(sk, TCP_CLOSE, 0); ++ break; ++ case TCP_FIN_WAIT2: ++ /* By some reason the socket may stay in this state ++ * without turning into a TW bucket. Fix it. ++ */ ++ tcp_time_wait(sk, TCP_FIN_WAIT2, 0); ++ break; ++ case TCP_LAST_ACK: ++ /* Just jump into CLOSED state. */ ++ tcp_done(sk); ++ break; ++ default: ++ /* The socket must be already close()d. */ ++ goto invalid; ++ } ++} ++ ++void tcp_v4_kill_ve_sockets(struct ve_struct *envid) ++{ ++ struct inet_ehash_bucket *head; ++ int i; ++ ++ /* alive */ ++ local_bh_disable(); ++ head = tcp_hashinfo.ehash; ++ for (i = 0; i < tcp_hashinfo.ehash_size; i++) { ++ struct sock *sk; ++ struct hlist_node *node; ++more_work: ++ write_lock(&head[i].lock); ++ sk_for_each(sk, node, &head[i].chain) { ++ if (ve_accessible_strict(VE_OWNER_SK(sk), envid)) { ++ sock_hold(sk); ++ write_unlock(&head[i].lock); ++ ++ bh_lock_sock(sk); ++ /* sk might have disappeared from the hash before ++ * we got the lock */ ++ if (sk->sk_state != TCP_CLOSE) ++ tcp_kill_ve_onesk(sk); ++ bh_unlock_sock(sk); ++ sock_put(sk); ++ goto more_work; ++ } ++ } ++ write_unlock(&head[i].lock); ++ } ++ local_bh_enable(); ++} ++EXPORT_SYMBOL(tcp_v4_kill_ve_sockets); ++#endif ++ + EXPORT_SYMBOL(ipv4_specific); +-EXPORT_SYMBOL(inet_bind_bucket_create); + EXPORT_SYMBOL(tcp_hashinfo); + EXPORT_SYMBOL(tcp_prot); + EXPORT_SYMBOL(tcp_unhash); +diff -uprN linux-2.6.15.orig/net/ipv4/tcp_minisocks.c linux-2.6.15-ve025stab014/net/ipv4/tcp_minisocks.c +--- linux-2.6.15.orig/net/ipv4/tcp_minisocks.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/tcp_minisocks.c 2006-01-27 14:48:08.000000000 +0300 +@@ -29,6 +29,8 @@ + #include <net/inet_common.h> + #include <net/xfrm.h> + ++#include <ub/ub_net.h> ++ + #ifdef CONFIG_SYSCTL + #define SYNC_INIT 0 /* let the user enable it */ + #else +@@ -305,6 +307,8 @@ void tcp_time_wait(struct sock *sk, int + tw->tw_ipv6only = np->ipv6only; + } + #endif ++ tw->tw_owner_env = VEID(VE_OWNER_SK(sk)); ++ + /* Linkage updates. 
*/ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); + +@@ -353,6 +357,8 @@ struct sock *tcp_create_openreq_child(st + struct tcp_sock *newtp; + + /* Now setup tcp_sock */ ++ SET_VE_OWNER_SK(newsk, VE_OWNER_SK(sk)); ++ + newtp = tcp_sk(newsk); + newtp->pred_flags = 0; + newtp->rcv_nxt = treq->rcv_isn + 1; +diff -uprN linux-2.6.15.orig/net/ipv4/tcp_output.c linux-2.6.15-ve025stab014/net/ipv4/tcp_output.c +--- linux-2.6.15.orig/net/ipv4/tcp_output.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/tcp_output.c 2006-01-27 14:48:06.000000000 +0300 +@@ -42,6 +42,9 @@ + #include <linux/module.h> + #include <linux/smp_lock.h> + ++#include <ub/ub_net.h> ++#include <ub/ub_tcp.h> ++ + /* People can turn this off for buggy TCP's found in printers etc. */ + int sysctl_tcp_retrans_collapse = 1; + +@@ -459,15 +462,23 @@ int tcp_fragment(struct sock *sk, struct + if (nsize < 0) + nsize = 0; + +- if (skb_cloned(skb) && +- skb_is_nonlinear(skb) && +- pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +- return -ENOMEM; ++ if (skb_cloned(skb) && skb_is_nonlinear(skb)) { ++ unsigned long chargesize; ++ chargesize = skb_bc(skb)->charged; ++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) ++ return -ENOMEM; ++ ub_sock_retwres_tcp(sk, chargesize, chargesize); ++ ub_tcpsndbuf_charge_forced(sk, skb); ++ } + + /* Get a new skb... force flag on. */ + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; /* We'll just try again later. */ ++ if (ub_tcpsndbuf_charge(sk, buff) < 0) { ++ kfree_skb(buff); ++ return -ENOMEM; ++ } + sk_charge_skb(sk, buff); + + /* Correct the sequence numbers. */ +@@ -1207,7 +1218,7 @@ u32 __tcp_select_window(struct sock *sk) + if (free_space < full_space/2) { + icsk->icsk_ack.quick = 0; + +- if (tcp_memory_pressure) ++ if (ub_tcp_shrink_rcvbuf(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); + + if (free_space < mss) +@@ -1634,6 +1645,7 @@ void tcp_send_fin(struct sock *sk) + break; + yield(); + } ++ ub_tcpsndbuf_charge_forced(sk, skb); + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); +@@ -1703,6 +1715,10 @@ int tcp_send_synack(struct sock *sk) + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + if (nskb == NULL) + return -ENOMEM; ++ if (ub_tcpsndbuf_charge(sk, skb) < 0) { ++ kfree_skb(nskb); ++ return -ENOMEM; ++ } + __skb_unlink(skb, &sk->sk_write_queue); + skb_header_release(nskb); + __skb_queue_head(&sk->sk_write_queue, nskb); +@@ -1854,6 +1870,10 @@ int tcp_connect(struct sock *sk) + buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); + if (unlikely(buff == NULL)) + return -ENOBUFS; ++ if (ub_tcpsndbuf_charge(sk, buff) < 0) { ++ kfree_skb(buff); ++ return -ENOBUFS; ++ } + + /* Reserve space for headers. 
*/ + skb_reserve(buff, MAX_TCP_HEADER); +diff -uprN linux-2.6.15.orig/net/ipv4/tcp_timer.c linux-2.6.15-ve025stab014/net/ipv4/tcp_timer.c +--- linux-2.6.15.orig/net/ipv4/tcp_timer.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/tcp_timer.c 2006-01-27 14:48:08.000000000 +0300 +@@ -22,6 +22,8 @@ + + #include <linux/module.h> + #include <net/tcp.h> ++#include <ub/ub_orphan.h> ++#include <ub/ub_tcp.h> + + int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; + int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; +@@ -67,7 +69,7 @@ static void tcp_write_err(struct sock *s + static int tcp_out_of_resources(struct sock *sk, int do_reset) + { + struct tcp_sock *tp = tcp_sk(sk); +- int orphans = atomic_read(&tcp_orphan_count); ++ int orphans = ub_get_orphan_count(sk); + + /* If peer does not open window for long time, or did not transmit + * anything for long time, penalize it. */ +@@ -78,9 +80,7 @@ static int tcp_out_of_resources(struct s + if (sk->sk_err_soft) + orphans <<= 1; + +- if (orphans >= sysctl_tcp_max_orphans || +- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && +- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { ++ if (ub_too_many_orphans(sk, orphans)) { + if (net_ratelimit()) + printk(KERN_INFO "Out of socket memory\n"); + +@@ -173,9 +173,12 @@ static int tcp_write_timeout(struct sock + static void tcp_delack_timer(unsigned long data) + { + struct sock *sk = (struct sock*)data; ++ struct ve_struct *env; + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + ++ env = set_exec_env(VE_OWNER_SK(sk)); ++ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ +@@ -224,11 +227,12 @@ static void tcp_delack_timer(unsigned lo + TCP_CHECK_TIMER(sk); + + out: +- if (tcp_memory_pressure) ++ if (ub_tcp_memory_pressure(sk)) + sk_stream_mem_reclaim(sk); + out_unlock: + bh_unlock_sock(sk); + sock_put(sk); ++ (void)set_exec_env(env); + } + + static void tcp_probe_timer(struct sock *sk) +@@ -283,8 +287,11 @@ static void tcp_probe_timer(struct sock + static void tcp_retransmit_timer(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct ve_struct *env; + struct inet_connection_sock *icsk = inet_csk(sk); + ++ env = set_exec_env(VE_OWNER_SK(sk)); ++ + if (!tp->packets_out) + goto out; + +@@ -381,15 +388,19 @@ out_reset_timer: + if (icsk->icsk_retransmits > sysctl_tcp_retries1) + __sk_dst_reset(sk); + +-out:; ++out: ++ (void)set_exec_env(env); + } + + static void tcp_write_timer(unsigned long data) + { + struct sock *sk = (struct sock*)data; ++ struct ve_struct *env; + struct inet_connection_sock *icsk = inet_csk(sk); + int event; + ++ env = set_exec_env(VE_OWNER_SK(sk)); ++ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ +@@ -423,6 +434,7 @@ out: + out_unlock: + bh_unlock_sock(sk); + sock_put(sk); ++ (void)set_exec_env(env); + } + + /* +@@ -450,10 +462,13 @@ void tcp_set_keepalive(struct sock *sk, + static void tcp_keepalive_timer (unsigned long data) + { + struct sock *sk = (struct sock *) data; ++ struct ve_struct *env; + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u32 elapsed; + ++ env = set_exec_env(VE_OWNER_SK(sk)); ++ + /* Only process if socket is not in use. 
*/ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { +@@ -525,4 +540,5 @@ death: + out: + bh_unlock_sock(sk); + sock_put(sk); ++ (void)set_exec_env(env); + } +diff -uprN linux-2.6.15.orig/net/ipv4/udp.c linux-2.6.15-ve025stab014/net/ipv4/udp.c +--- linux-2.6.15.orig/net/ipv4/udp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv4/udp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -126,7 +126,9 @@ static int udp_v4_get_port(struct sock * + struct hlist_node *node; + struct sock *sk2; + struct inet_sock *inet = inet_sk(sk); ++ struct ve_struct *env; + ++ env = VE_OWNER_SK(sk); + write_lock_bh(&udp_hash_lock); + if (snum == 0) { + int best_size_so_far, best, result, i; +@@ -140,7 +142,7 @@ static int udp_v4_get_port(struct sock * + struct hlist_head *list; + int size; + +- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; ++ list = &udp_hash[udp_hashfn(result, VEID(env))]; + if (hlist_empty(list)) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + +@@ -162,7 +164,7 @@ static int udp_v4_get_port(struct sock * + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); +- if (!udp_lport_inuse(result)) ++ if (!udp_lport_inuse(result, env)) + break; + } + if (i >= (1 << 16) / UDP_HTABLE_SIZE) +@@ -171,11 +173,12 @@ gotit: + udp_port_rover = snum = result; + } else { + sk_for_each(sk2, node, +- &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { ++ &udp_hash[udp_hashfn(snum, VEID(env))]) { + struct inet_sock *inet2 = inet_sk(sk2); + + if (inet2->num == snum && + sk2 != sk && ++ ve_accessible_strict(VE_OWNER_SK(sk2), env) && + !ipv6_only_sock(sk2) && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || +@@ -189,7 +192,7 @@ gotit: + } + inet->num = snum; + if (sk_unhashed(sk)) { +- struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; ++ struct hlist_head *h = &udp_hash[udp_hashfn(snum, VEID(env))]; + + sk_add_node(sk, h); + sock_prot_inc_use(sk->sk_prot); +@@ -227,11 +230,15 @@ static struct sock *udp_v4_lookup_longwa + struct hlist_node *node; + unsigned short hnum = ntohs(dport); + int badness = -1; ++ struct ve_struct *env; + +- sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { ++ env = get_exec_env(); ++ sk_for_each(sk, node, &udp_hash[udp_hashfn(hnum, VEID(env))]) { + struct inet_sock *inet = inet_sk(sk); + +- if (inet->num == hnum && !ipv6_only_sock(sk)) { ++ if (inet->num == hnum && ++ ve_accessible_strict(VE_OWNER_SK(sk), env) && ++ !ipv6_only_sock(sk)) { + int score = (sk->sk_family == PF_INET ? 
1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) +@@ -1060,7 +1067,8 @@ static int udp_v4_mcast_deliver(struct s + int dif; + + read_lock(&udp_hash_lock); +- sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); ++ sk = sk_head(&udp_hash[udp_hashfn(ntohs(uh->dest), ++ VEID(VE_OWNER_SKB(skb)))]); + dif = skb->dev->ifindex; + sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); + if (sk) { +@@ -1379,10 +1387,14 @@ static struct sock *udp_get_first(struct + { + struct sock *sk; + struct udp_iter_state *state = seq->private; ++ struct ve_struct *env; + ++ env = get_exec_env(); + for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { + struct hlist_node *node; + sk_for_each(sk, node, &udp_hash[state->bucket]) { ++ if (!ve_accessible(VE_OWNER_SK(sk), env)) ++ continue; + if (sk->sk_family == state->family) + goto found; + } +@@ -1399,8 +1411,13 @@ static struct sock *udp_get_next(struct + do { + sk = sk_next(sk); + try_again: +- ; +- } while (sk && sk->sk_family != state->family); ++ if (!sk) ++ break; ++ if (sk->sk_family != state->family) ++ continue; ++ if (ve_accessible(VE_OWNER_SK(sk), get_exec_env())) ++ break; ++ } while (1); + + if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { + sk = sk_head(&udp_hash[state->bucket]); +@@ -1486,7 +1503,12 @@ int udp_proc_register(struct udp_seq_afi + afinfo->seq_fops->llseek = seq_lseek; + afinfo->seq_fops->release = seq_release_private; + +- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); ++ if (*(afinfo->name) == 'n') ++ p = proc_glob_fops_create(afinfo->name, S_IRUGO, ++ afinfo->seq_fops); ++ else ++ p = proc_net_fops_create(afinfo->name, S_IRUGO, ++ afinfo->seq_fops); + if (p) + p->data = afinfo; + else +@@ -1498,7 +1520,10 @@ void udp_proc_unregister(struct udp_seq_ + { + if (!afinfo) + return; +- proc_net_remove(afinfo->name); ++ if (*(afinfo->name) == 'n') ++ remove_proc_glob_entry(afinfo->name, NULL); ++ else ++ proc_net_remove(afinfo->name); + memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); + } + +@@ -1541,7 +1566,7 @@ static int udp4_seq_show(struct seq_file + static struct file_operations udp4_seq_fops; + static struct udp_seq_afinfo udp4_seq_afinfo = { + .owner = THIS_MODULE, +- .name = "udp", ++ .name = "net/udp", + .family = AF_INET, + .seq_show = udp4_seq_show, + .seq_fops = &udp4_seq_fops, +diff -uprN linux-2.6.15.orig/net/ipv6/addrconf.c linux-2.6.15-ve025stab014/net/ipv6/addrconf.c +--- linux-2.6.15.orig/net/ipv6/addrconf.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv6/addrconf.c 2006-01-27 14:48:08.000000000 +0300 +@@ -2152,6 +2152,10 @@ static int addrconf_notify(struct notifi + struct inet6_dev *idev = __in6_dev_get(dev); + int run_pending = 0; + ++ /* not virtualized yet */ ++ if (!ve_is_super(get_exec_env())) ++ return NOTIFY_OK; ++ + switch(event) { + case NETDEV_UP: + case NETDEV_CHANGE: +diff -uprN linux-2.6.15.orig/net/ipv6/inet6_hashtables.c linux-2.6.15-ve025stab014/net/ipv6/inet6_hashtables.c +--- linux-2.6.15.orig/net/ipv6/inet6_hashtables.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv6/inet6_hashtables.c 2006-01-27 14:48:08.000000000 +0300 +@@ -31,7 +31,7 @@ struct sock *inet6_lookup_listener(struc + int score, hiscore = 0; + + read_lock(&hashinfo->lhash_lock); +- sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { ++ sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum, 0)]) { + if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { + 
const struct ipv6_pinfo *np = inet6_sk(sk); + +diff -uprN linux-2.6.15.orig/net/ipv6/tcp_ipv6.c linux-2.6.15-ve025stab014/net/ipv6/tcp_ipv6.c +--- linux-2.6.15.orig/net/ipv6/tcp_ipv6.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv6/tcp_ipv6.c 2006-01-27 14:48:08.000000000 +0300 +@@ -117,7 +117,7 @@ static int tcp_v6_get_port(struct sock * + int rover = net_random() % (high - low) + low; + + do { +- head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; ++ head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size, 0)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == rover) +@@ -142,7 +142,7 @@ static int tcp_v6_get_port(struct sock * + /* OK, here is the one we will use. */ + snum = rover; + } else { +- head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; ++ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size, 0)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == snum) +@@ -164,7 +164,7 @@ tb_found: + tb_not_found: + ret = 1; + if (tb == NULL) { +- tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); ++ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum, NULL); + if (tb == NULL) + goto fail_unlock; + } +@@ -419,7 +419,7 @@ static int tcp_v6_hash_connect(struct so + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; +- head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; ++ head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size, 0)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, +@@ -439,7 +439,7 @@ static int tcp_v6_hash_connect(struct so + } + } + +- tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); ++ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port, NULL); + if (!tb) { + spin_unlock(&head->lock); + break; +@@ -474,7 +474,7 @@ ok: + goto out; + } + +- head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; ++ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size, 0)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + +diff -uprN linux-2.6.15.orig/net/ipv6/udp.c linux-2.6.15-ve025stab014/net/ipv6/udp.c +--- linux-2.6.15.orig/net/ipv6/udp.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/ipv6/udp.c 2006-01-27 14:48:08.000000000 +0300 +@@ -68,7 +68,9 @@ static int udp_v6_get_port(struct sock * + { + struct sock *sk2; + struct hlist_node *node; ++ struct ve_struct *env; + ++ env = VE_OWNER_SK(sk); + write_lock_bh(&udp_hash_lock); + if (snum == 0) { + int best_size_so_far, best, result, i; +@@ -82,7 +84,7 @@ static int udp_v6_get_port(struct sock * + int size; + struct hlist_head *list; + +- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; ++ list = &udp_hash[udp_hashfn(result, VEID(env))]; + if (hlist_empty(list)) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + +@@ -104,7 +106,7 @@ static int udp_v6_get_port(struct sock * + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); +- if (!udp_lport_inuse(result)) ++ if (!udp_lport_inuse(result, env)) + break; + } + if (i >= (1 << 16) / UDP_HTABLE_SIZE) +@@ -113,9 +115,10 @@ gotit: + udp_port_rover = snum = result; + } else { + sk_for_each(sk2, node, +- &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { ++ 
&udp_hash[udp_hashfn(snum, VEID(env))]) { + if (inet_sk(sk2)->num == snum && + sk2 != sk && ++ ve_accessible_strict(VE_OWNER_SK(sk2), env) && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && +@@ -127,7 +130,7 @@ gotit: + + inet_sk(sk)->num = snum; + if (sk_unhashed(sk)) { +- sk_add_node(sk, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]); ++ sk_add_node(sk, &udp_hash[udp_hashfn(snum, VEID(env))]); + sock_prot_inc_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); +diff -uprN linux-2.6.15.orig/net/netfilter/core.c linux-2.6.15-ve025stab014/net/netfilter/core.c +--- linux-2.6.15.orig/net/netfilter/core.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/netfilter/core.c 2006-01-27 14:48:08.000000000 +0300 +@@ -32,16 +32,24 @@ + * of skbuffs queued for userspace, and not deregister a hook unless + * this is zero, but that sucks. Now, we simply check when the + * packets come back: if the hook is gone, the packet is discarded. */ ++static DEFINE_SPINLOCK(nf_hook_lock); ++ + struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; + EXPORT_SYMBOL(nf_hooks); +-static DEFINE_SPINLOCK(nf_hook_lock); ++#ifdef CONFIG_VE_IPTABLES ++#define ve_nf_hooks \ ++ ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks)) ++#else ++#define ve_nf_hooks nf_hooks ++#endif ++ + + int nf_register_hook(struct nf_hook_ops *reg) + { + struct list_head *i; + + spin_lock_bh(&nf_hook_lock); +- list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { ++ list_for_each(i, &ve_nf_hooks[reg->pf][reg->hooknum]) { + if (reg->priority < ((struct nf_hook_ops *)i)->priority) + break; + } +@@ -53,6 +61,33 @@ int nf_register_hook(struct nf_hook_ops + } + EXPORT_SYMBOL(nf_register_hook); + ++int virt_nf_register_hook(struct nf_hook_ops *reg) ++{ ++ int ret = 0; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct nf_hook_ops *tmp; ++ ret = -ENOMEM; ++ tmp = kmalloc(sizeof(struct nf_hook_ops), GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, reg, sizeof(struct nf_hook_ops)); ++ reg = tmp; ++ } ++ ++ ret = nf_register_hook(reg); ++ if (ret) ++ goto out; ++ ++ return 0; ++out: ++ if (!ve_is_super(get_exec_env())) ++ kfree(reg); ++nomem: ++ return ret; ++} ++EXPORT_SYMBOL(virt_nf_register_hook); ++ + void nf_unregister_hook(struct nf_hook_ops *reg) + { + spin_lock_bh(&nf_hook_lock); +@@ -63,6 +98,29 @@ void nf_unregister_hook(struct nf_hook_o + } + EXPORT_SYMBOL(nf_unregister_hook); + ++int virt_nf_unregister_hook(struct nf_hook_ops *reg) ++{ ++ struct nf_hook_ops *i; ++ ++ spin_lock_bh(&nf_hook_lock); ++ list_for_each_entry(i, &ve_nf_hooks[reg->pf][reg->hooknum], list) { ++ if (reg->hook == i->hook) { ++ reg = i; ++ break; ++ } ++ } ++ spin_unlock_bh(&nf_hook_lock); ++ if (reg != i) ++ return -ENOENT; ++ ++ nf_unregister_hook(reg); ++ ++ if (!ve_is_super(get_exec_env())) ++ kfree(reg); ++ return 0; ++} ++EXPORT_SYMBOL(virt_nf_unregister_hook); ++ + unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, +@@ -120,9 +178,9 @@ int nf_hook_slow(int pf, unsigned int ho + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + +- elem = &nf_hooks[pf][hook]; ++ elem = &ve_nf_hooks[pf][hook]; + next_hook: +- verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, ++ verdict = nf_iterate(&ve_nf_hooks[pf][hook], pskb, hook, indev, + outdev, &elem, okfn, hook_thresh); + if (verdict == NF_ACCEPT || verdict == NF_STOP) { + ret = 1; +@@ -195,13 +253,54 @@ struct proc_dir_entry *proc_net_netfilte + 
EXPORT_SYMBOL(proc_net_netfilter); + #endif + +-void __init netfilter_init(void) ++void init_nf_hooks(struct list_head (*nh)[NF_MAX_HOOKS]) + { + int i, h; + for (i = 0; i < NPROTO; i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) +- INIT_LIST_HEAD(&nf_hooks[i][h]); ++ INIT_LIST_HEAD(&ve_nf_hooks[i][h]); + } ++} ++ ++int init_netfilter(void) ++{ ++#ifdef CONFIG_VE_IPTABLES ++ struct ve_struct *envid; ++ ++ envid = get_exec_env(); ++ envid->_nf_hooks = kmalloc(sizeof(nf_hooks), GFP_KERNEL); ++ if (envid->_nf_hooks == NULL) ++ return -ENOMEM; ++ ++ /* FIXME: charge ubc */ ++ ++ init_nf_hooks(envid->_nf_hooks); ++ return 0; ++#else ++ init_nf_hooks(nf_hooks); ++ return 0; ++#endif ++} ++EXPORT_SYMBOL(init_netfilter); ++ ++#ifdef CONFIG_VE_IPTABLES ++void fini_netfilter(void) ++{ ++ struct ve_struct *envid; ++ ++ envid = get_exec_env(); ++ if (envid->_nf_hooks != NULL) ++ kfree(envid->_nf_hooks); ++ envid->_nf_hooks = NULL; ++ ++ /* FIXME: uncharge ubc */ ++} ++EXPORT_SYMBOL(fini_netfilter); ++#endif ++ ++void __init netfilter_init(void) ++{ ++ init_netfilter(); + + #ifdef CONFIG_PROC_FS + proc_net_netfilter = proc_mkdir("netfilter", proc_net); +@@ -214,3 +313,4 @@ void __init netfilter_init(void) + if (netfilter_log_init() < 0) + panic("cannot initialize nf_log"); + } ++ +diff -uprN linux-2.6.15.orig/net/netfilter/nf_queue.c linux-2.6.15-ve025stab014/net/netfilter/nf_queue.c +--- linux-2.6.15.orig/net/netfilter/nf_queue.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/netfilter/nf_queue.c 2006-01-27 14:48:08.000000000 +0300 +@@ -207,12 +207,12 @@ void nf_reinject(struct sk_buff *skb, st + /* Drop reference to owner of hook which queued us. */ + module_put(info->elem->owner); + +- list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { ++ list_for_each_rcu(i, &ve_nf_hooks[info->pf][info->hook]) { + if (i == elem) + break; + } + +- if (elem == &nf_hooks[info->pf][info->hook]) { ++ if (elem == &ve_nf_hooks[info->pf][info->hook]) { + /* The module which sent it to userspace is gone. */ + NFDEBUG("%s: module disappeared, dropping packet.\n", + __FUNCTION__); +@@ -227,7 +227,7 @@ void nf_reinject(struct sk_buff *skb, st + + if (verdict == NF_ACCEPT) { + next_hook: +- verdict = nf_iterate(&nf_hooks[info->pf][info->hook], ++ verdict = nf_iterate(&ve_nf_hooks[info->pf][info->hook], + &skb, info->hook, + info->indev, info->outdev, &elem, + info->okfn, INT_MIN); +diff -uprN linux-2.6.15.orig/net/netfilter/nf_sockopt.c linux-2.6.15-ve025stab014/net/netfilter/nf_sockopt.c +--- linux-2.6.15.orig/net/netfilter/nf_sockopt.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/netfilter/nf_sockopt.c 2006-01-27 14:48:08.000000000 +0300 +@@ -80,6 +80,12 @@ static int nf_sockopt(struct sock *sk, i + struct nf_sockopt_ops *ops; + int ret; + ++#ifdef CONFIG_VE_IPTABLES ++ if (!get_exec_env()->_nf_hooks || ++ !get_exec_env()->_ipt_standard_target) ++ return -ENOPROTOOPT; ++#endif ++ + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + +diff -uprN linux-2.6.15.orig/net/netlink/af_netlink.c linux-2.6.15-ve025stab014/net/netlink/af_netlink.c +--- linux-2.6.15.orig/net/netlink/af_netlink.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/netlink/af_netlink.c 2006-01-27 14:48:08.000000000 +0300 +@@ -60,6 +60,9 @@ + #include <net/scm.h> + #include <net/netlink.h> + ++#include <ub/beancounter.h> ++#include <ub/ub_net.h> ++ + #define Nprintk(a...) 
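The netfilter hunks above replace the single global nf_hooks[][] table with a per-environment table reached through get_exec_env()->_nf_hooks, so each VE registers, iterates and unregisters only its own hooks, while virt_nf_register_hook()/virt_nf_unregister_hook() duplicate the nf_hook_ops for non-root environments. The af_netlink.c changes that begin here rely on the same two recurring ideas: ownership checks (VE_OWNER_SK(), ve_accessible_strict()) and beancounter charging (ub_other_sock_charge(), ub_nlrcvbuf_charge()). A minimal userspace sketch of the per-environment table pattern follows; every name in it (struct env, current_env, register_hook, ...) is invented for illustration and is not the patch's own API.

#include <stdio.h>
#include <string.h>

#define NPROTO       8
#define NF_MAX_HOOKS 4

struct env {                                  /* stands in for struct ve_struct    */
	void *hooks[NPROTO][NF_MAX_HOOKS];    /* stands in for the _nf_hooks table */
};

static struct env host_env;                   /* VE0, the host environment         */
static struct env *current_env = &host_env;   /* stands in for get_exec_env()      */

/* Registration and lookup always go through the current environment's
 * table, so a container only ever sees and modifies its own hook lists. */
static void register_hook(int pf, int hooknum, void *fn)
{
	current_env->hooks[pf][hooknum] = fn;
}

static void *lookup_hook(int pf, int hooknum)
{
	return current_env->hooks[pf][hooknum];
}

int main(void)
{
	struct env container;                 /* a freshly created environment     */

	memset(&container, 0, sizeof(container));
	register_hook(1, 0, (void *)0x1);     /* hook registered by the host       */

	current_env = &container;             /* context switch, cf. set_exec_env() */
	printf("container sees: %p\n", lookup_hook(1, 0)); /* NULL: isolated       */

	current_env = &host_env;
	printf("host sees:      %p\n", lookup_hook(1, 0)); /* the host's hook      */
	return 0;
}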
+ #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) + +@@ -208,7 +211,10 @@ static __inline__ struct sock *netlink_l + read_lock(&nl_table_lock); + head = nl_pid_hashfn(hash, pid); + sk_for_each(sk, node, head) { +- if (nlk_sk(sk)->pid == pid) { ++ /* VEs should find sockets, created by kernel */ ++ if ((nlk_sk(sk)->pid == pid) && ++ (!pid || ve_accessible_strict(VE_OWNER_SK(sk), ++ get_exec_env()))){ + sock_hold(sk); + goto found; + } +@@ -308,7 +314,9 @@ static int netlink_insert(struct sock *s + head = nl_pid_hashfn(hash, pid); + len = 0; + sk_for_each(osk, node, head) { +- if (nlk_sk(osk)->pid == pid) ++ if ((nlk_sk(sk)->pid == pid) && ++ ve_accessible_strict(VE_OWNER_SK(sk), ++ get_exec_env())) + break; + len++; + } +@@ -361,6 +369,8 @@ static int __netlink_create(struct socke + sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); + if (!sk) + return -ENOMEM; ++ if (ub_other_sock_charge(sk)) ++ goto out_free; + + sock_init_data(sock, sk); + +@@ -371,6 +381,10 @@ static int __netlink_create(struct socke + sk->sk_destruct = netlink_sock_destruct; + sk->sk_protocol = protocol; + return 0; ++ ++out_free: ++ sk_free(sk); ++ return -ENOMEM; + } + + static int netlink_create(struct socket *sock, int protocol) +@@ -402,7 +416,7 @@ static int netlink_create(struct socket + groups = nl_table[protocol].groups; + netlink_unlock_table(); + +- if ((err = __netlink_create(sock, protocol) < 0)) ++ if ((err = __netlink_create(sock, protocol)) < 0) + goto out_module; + + nlk = nlk_sk(sock->sk); +@@ -476,7 +490,7 @@ static int netlink_autobind(struct socke + struct hlist_head *head; + struct sock *osk; + struct hlist_node *node; +- s32 pid = current->tgid; ++ s32 pid = virt_pid(current); + int err; + static s32 rover = -4097; + +@@ -485,7 +499,9 @@ retry: + netlink_table_grab(); + head = nl_pid_hashfn(hash, pid); + sk_for_each(osk, node, head) { +- if (nlk_sk(osk)->pid == pid) { ++ if ((nlk_sk(osk)->pid == pid) && ++ ve_accessible_strict(VE_OWNER_SK(osk), ++ get_exec_env())) { + /* Bind collision, search negative pid values. */ + pid = rover--; + if (rover > -4097) +@@ -510,7 +526,7 @@ retry: + static inline int netlink_capable(struct socket *sock, unsigned int flag) + { + return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || +- capable(CAP_NET_ADMIN); ++ capable(CAP_VE_NET_ADMIN); + } + + static void +@@ -701,7 +717,8 @@ struct sock *netlink_getsockbyfilp(struc + * 0: continue + * 1: repeat lookup - reference dropped while waiting for socket memory. 
+ */ +-int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo) ++int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, ++ long timeo, struct sock *ssk) + { + struct netlink_sock *nlk; + +@@ -711,7 +728,7 @@ int netlink_attachskb(struct sock *sk, s + test_bit(0, &nlk->state)) { + DECLARE_WAITQUEUE(wait, current); + if (!timeo) { +- if (!nlk->pid) ++ if (!ssk || nlk_sk(ssk)->pid == 0) + netlink_overrun(sk); + sock_put(sk); + kfree_skb(skb); +@@ -796,7 +813,7 @@ retry: + kfree_skb(skb); + return PTR_ERR(sk); + } +- err = netlink_attachskb(sk, skb, nonblock, timeo); ++ err = netlink_attachskb(sk, skb, nonblock, timeo, ssk); + if (err == 1) + goto retry; + if (err) +@@ -843,6 +860,9 @@ static inline int do_one_broadcast(struc + !test_bit(p->group - 1, nlk->groups)) + goto out; + ++ if (!ve_accessible_strict(get_exec_env(), VE_OWNER_SK(sk))) ++ goto out; ++ + if (p->failure) { + netlink_overrun(sk); + goto out; +@@ -940,6 +960,9 @@ static inline int do_one_set_err(struct + !test_bit(p->group - 1, nlk->groups)) + goto out; + ++ if (!ve_accessible_strict(get_exec_env(), VE_OWNER_SK(sk))) ++ goto out; ++ + sk->sk_err = p->code; + sk->sk_error_report(sk); + out: +@@ -1074,12 +1097,17 @@ static int netlink_sendmsg(struct kiocb + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); +- struct sockaddr_nl *addr=msg->msg_name; ++ struct sockaddr_nl *addr = msg->msg_name; + u32 dst_pid; +- u32 dst_group; + struct sk_buff *skb; + int err; + struct scm_cookie scm; ++ struct sock *dstsk; ++ long timeo; ++ int no_ubc, no_buf; ++ unsigned long chargesize; ++ ++ DECLARE_WAITQUEUE(wait, current); + + if (msg->msg_flags&MSG_OOB) + return -EOPNOTSUPP; +@@ -1090,17 +1118,16 @@ static int netlink_sendmsg(struct kiocb + if (err < 0) + return err; + ++ /* Broadcasts from user to kernel are disabled. 
This is OK ++ * according to ANK */ + if (msg->msg_namelen) { + if (addr->nl_family != AF_NETLINK) + return -EINVAL; + dst_pid = addr->nl_pid; +- dst_group = ffs(addr->nl_groups); +- if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) ++ if (addr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + return -EPERM; +- } else { ++ } else + dst_pid = nlk->dst_pid; +- dst_group = nlk->dst_group; +- } + + if (!nlk->pid) { + err = netlink_autobind(sock); +@@ -1113,12 +1140,12 @@ static int netlink_sendmsg(struct kiocb + goto out; + err = -ENOBUFS; + skb = alloc_skb(len, GFP_KERNEL); +- if (skb==NULL) ++ if (skb == NULL) + goto out; + + NETLINK_CB(skb).pid = nlk->pid; + NETLINK_CB(skb).dst_pid = dst_pid; +- NETLINK_CB(skb).dst_group = dst_group; ++ NETLINK_CB(skb).dst_group = 0; + NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); + memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); + +@@ -1129,25 +1156,88 @@ static int netlink_sendmsg(struct kiocb + */ + + err = -EFAULT; +- if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) { +- kfree_skb(skb); +- goto out; +- } ++ if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) ++ goto out_free; + + err = security_netlink_send(sk, skb); +- if (err) { +- kfree_skb(skb); +- goto out; ++ if (err) ++ goto out_free; ++ ++ timeo = sock_sndtimeo(sk, msg->msg_flags&MSG_DONTWAIT); ++retry: ++ dstsk = netlink_getsockbypid(sk, dst_pid); ++ if (IS_ERR(dstsk)) { ++ err = PTR_ERR(dstsk); ++ goto out_free; + } + +- if (dst_group) { +- atomic_inc(&skb->users); +- netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); ++ nlk = nlk_sk(dstsk); ++#ifdef NL_EMULATE_DEV ++ if (nlk->handler) { ++ skb_orphan(skb); ++ err = nlk->handler(protocol, skb); ++ goto out_put; + } +- err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); ++#endif + ++ /* BTW, it could be done once, before the retry loop */ ++ chargesize = skb_charge_fullsize(skb); ++ no_ubc = ub_sock_getwres_other(sk, chargesize); ++ no_buf = atomic_read(&dstsk->sk_rmem_alloc) > dstsk->sk_rcvbuf || ++ test_bit(0, &nlk->state); ++ if (no_ubc || no_buf) { ++ wait_queue_head_t *sleep; ++ ++ if (!no_ubc) ++ ub_sock_retwres_other(sk, chargesize, ++ SOCK_MIN_UBCSPACE_CH); ++ err = -EAGAIN; ++ if (timeo == 0) { ++ kfree_skb(skb); ++ goto out_put; ++ } ++ ++ /* wake up comes to different queues */ ++ sleep = no_ubc ? 
sk->sk_sleep : &nlk->wait; ++ __set_current_state(TASK_INTERRUPTIBLE); ++ add_wait_queue(sleep, &wait); ++ ++ /* this if can't be moved upper because ub_sock_snd_queue_add() ++ * may change task state to TASK_RUNNING */ ++ if (no_ubc) ++ ub_sock_sndqueueadd_other(sk, chargesize); ++ ++ if ((atomic_read(&dstsk->sk_rmem_alloc) > dstsk->sk_rcvbuf || ++ test_bit(0, &nlk->state) || no_ubc) && ++ !sock_flag(dstsk, SOCK_DEAD)) ++ timeo = schedule_timeout(timeo); ++ ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(sleep, &wait); ++ if (no_ubc) ++ ub_sock_sndqueuedel(sk); ++ sock_put(dstsk); ++ ++ if (!signal_pending(current)) ++ goto retry; ++ err = sock_intr_errno(timeo); ++ goto out_free; ++ } ++ ++ skb_orphan(skb); ++ skb_set_owner_r(skb, dstsk); ++ ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); ++ skb_queue_tail(&dstsk->sk_receive_queue, skb); ++ dstsk->sk_data_ready(dstsk, len); ++ err = len; ++out_put: ++ sock_put(dstsk); + out: + return err; ++ ++out_free: ++ kfree_skb(skb); ++ return err; + } + + static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, +@@ -1300,6 +1390,10 @@ static int netlink_dump(struct sock *sk) + skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); + if (!skb) + return -ENOBUFS; ++ if (ub_nlrcvbuf_charge(skb, sk) < 0) { ++ kfree_skb(skb); ++ return -EACCES; ++ } + + spin_lock(&nlk->cb_lock); + +@@ -1422,7 +1516,7 @@ static int netlink_rcv_skb(struct sk_buf + while (skb->len >= nlmsg_total_size(0)) { + nlh = (struct nlmsghdr *) skb->data; + +- if (skb->len < nlh->nlmsg_len) ++ if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) + return 0; + + total_len = min(NLMSG_ALIGN(nlh->nlmsg_len), skb->len); +@@ -1468,8 +1562,15 @@ void netlink_run_queue(struct sock *sk, + *qlen = skb_queue_len(&sk->sk_receive_queue); + + for (; *qlen; (*qlen)--) { ++ int ret; ++ struct ve_struct *old_env; + skb = skb_dequeue(&sk->sk_receive_queue); +- if (netlink_rcv_skb(skb, cb)) { ++ ++ old_env = set_exec_env(VE_OWNER_SKB(skb)); ++ ret = netlink_rcv_skb(skb, cb); ++ (void)set_exec_env(old_env); ++ ++ if (ret) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, skb); + else { +@@ -1737,6 +1838,7 @@ enomem: + + sock_register(&netlink_family_ops); + #ifdef CONFIG_PROC_FS ++ /* FIXME: virtualize before give access from VEs */ + proc_net_fops_create("netlink", 0, &netlink_seq_fops); + #endif + /* The netlink device handler may be needed early. */ +diff -uprN linux-2.6.15.orig/net/packet/af_packet.c linux-2.6.15-ve025stab014/net/packet/af_packet.c +--- linux-2.6.15.orig/net/packet/af_packet.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/packet/af_packet.c 2006-01-27 14:48:08.000000000 +0300 +@@ -78,6 +78,8 @@ + #include <linux/module.h> + #include <linux/init.h> + ++#include <ub/ub_net.h> ++ + #ifdef CONFIG_INET + #include <net/inet_common.h> + #endif +@@ -279,7 +281,8 @@ static int packet_rcv_spkt(struct sk_buf + * so that this procedure is noop. 
+ */ + +- if (skb->pkt_type == PACKET_LOOPBACK) ++ if (skb->pkt_type == PACKET_LOOPBACK || ++ !ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk))) + goto out; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) +@@ -471,6 +474,9 @@ static int packet_rcv(struct sk_buff *sk + sk = pt->af_packet_priv; + po = pkt_sk(sk); + ++ if (!ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk))) ++ goto drop; ++ + skb->dev = dev; + + if (dev->hard_header) { +@@ -530,6 +536,9 @@ static int packet_rcv(struct sk_buff *sk + if (pskb_trim(skb, snaplen)) + goto drop_n_acct; + ++ if (ub_sockrcvbuf_charge(sk, skb)) ++ goto drop_n_acct; ++ + skb_set_owner_r(skb, sk); + skb->dev = NULL; + dst_release(skb->dst); +@@ -580,6 +589,9 @@ static int tpacket_rcv(struct sk_buff *s + sk = pt->af_packet_priv; + po = pkt_sk(sk); + ++ if (!ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk))) ++ goto drop; ++ + if (dev->hard_header) { + if (sk->sk_type != SOCK_DGRAM) + skb_push(skb, skb->data - skb->mac.raw); +@@ -629,6 +641,12 @@ static int tpacket_rcv(struct sk_buff *s + if (snaplen > skb->len-skb->data_len) + snaplen = skb->len-skb->data_len; + ++ if (copy_skb && ++ ub_sockrcvbuf_charge(sk, copy_skb)) { ++ spin_lock(&sk->sk_receive_queue.lock); ++ goto ring_is_full; ++ } ++ + spin_lock(&sk->sk_receive_queue.lock); + h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head); + +@@ -1009,6 +1027,8 @@ static int packet_create(struct socket * + sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1); + if (sk == NULL) + goto out; ++ if (ub_other_sock_charge(sk)) ++ goto out_free; + + sock->ops = &packet_ops; + #ifdef CONFIG_SOCK_PACKET +@@ -1047,6 +1067,9 @@ static int packet_create(struct socket * + sk_add_node(sk, &packet_sklist); + write_unlock_bh(&packet_sklist_lock); + return(0); ++ ++out_free: ++ sk_free(sk); + out: + return err; + } +@@ -1429,11 +1452,16 @@ static int packet_notifier(struct notifi + struct sock *sk; + struct hlist_node *node; + struct net_device *dev = (struct net_device*)data; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + read_lock(&packet_sklist_lock); + sk_for_each(sk, node, &packet_sklist) { + struct packet_sock *po = pkt_sk(sk); + ++ if (!ve_accessible_strict(VE_OWNER_SK(sk), ve)) ++ continue; ++ + switch (msg) { + case NETDEV_UNREGISTER: + #ifdef CONFIG_PACKET_MULTICAST +@@ -1844,6 +1872,8 @@ static inline struct sock *packet_seq_id + struct hlist_node *node; + + sk_for_each(s, node, &packet_sklist) { ++ if (!ve_accessible(VE_OWNER_SK(s), get_exec_env())) ++ continue; + if (!off--) + return s; + } +@@ -1859,9 +1889,13 @@ static void *packet_seq_start(struct seq + static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) + { + ++*pos; +- return (v == SEQ_START_TOKEN) +- ? sk_head(&packet_sklist) +- : sk_next((struct sock*)v) ; ++ do { ++ v = (v == SEQ_START_TOKEN) ++ ? 
sk_head(&packet_sklist) ++ : sk_next((struct sock*)v); ++ } while (v != NULL && ++ !ve_accessible(VE_OWNER_SK((struct sock*)v), get_exec_env())); ++ return v; + } + + static void packet_seq_stop(struct seq_file *seq, void *v) +@@ -1917,7 +1951,7 @@ static struct file_operations packet_seq + + static void __exit packet_exit(void) + { +- proc_net_remove("packet"); ++ remove_proc_glob_entry("net/packet", NULL); + unregister_netdevice_notifier(&packet_netdev_notifier); + sock_unregister(PF_PACKET); + proto_unregister(&packet_proto); +@@ -1932,7 +1966,7 @@ static int __init packet_init(void) + + sock_register(&packet_family_ops); + register_netdevice_notifier(&packet_netdev_notifier); +- proc_net_fops_create("packet", 0, &packet_seq_fops); ++ proc_glob_fops_create("net/packet", 0, &packet_seq_fops); + out: + return rc; + } +diff -uprN linux-2.6.15.orig/net/sched/sch_generic.c linux-2.6.15-ve025stab014/net/sched/sch_generic.c +--- linux-2.6.15.orig/net/sched/sch_generic.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/sched/sch_generic.c 2006-01-27 14:48:08.000000000 +0300 +@@ -97,6 +97,7 @@ int qdisc_restart(struct net_device *dev + + /* Dequeue packet */ + if ((skb = q->dequeue(q)) != NULL) { ++ struct ve_struct *envid; + unsigned nolock = (dev->features & NETIF_F_LLTX); + /* + * When the driver has LLTX set it does its own locking +@@ -107,6 +108,7 @@ int qdisc_restart(struct net_device *dev + * of lock congestion it should return -1 and the packet + * will be requeued. + */ ++ envid = set_exec_env(VE_OWNER_SKB(skb)); + if (!nolock) { + if (!spin_trylock(&dev->xmit_lock)) { + collision: +@@ -121,6 +123,7 @@ int qdisc_restart(struct net_device *dev + kfree_skb(skb); + if (net_ratelimit()) + printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); ++ (void)set_exec_env(envid); + return -1; + } + __get_cpu_var(netdev_rx_stat).cpu_collision++; +@@ -146,6 +149,7 @@ int qdisc_restart(struct net_device *dev + spin_unlock(&dev->xmit_lock); + } + spin_lock(&dev->queue_lock); ++ (void)set_exec_env(envid); + return -1; + } + if (ret == NETDEV_TX_LOCKED && nolock) { +@@ -177,6 +181,7 @@ int qdisc_restart(struct net_device *dev + requeue: + q->ops->requeue(skb, q); + netif_schedule(dev); ++ (void)set_exec_env(envid); + return 1; + } + BUG_ON((int) q->q.qlen < 0); +@@ -625,3 +630,4 @@ EXPORT_SYMBOL(qdisc_reset); + EXPORT_SYMBOL(qdisc_restart); + EXPORT_SYMBOL(qdisc_lock_tree); + EXPORT_SYMBOL(qdisc_unlock_tree); ++EXPORT_SYMBOL(dev_shutdown); +diff -uprN linux-2.6.15.orig/net/sched/sch_teql.c linux-2.6.15-ve025stab014/net/sched/sch_teql.c +--- linux-2.6.15.orig/net/sched/sch_teql.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/sched/sch_teql.c 2006-01-27 14:48:08.000000000 +0300 +@@ -188,6 +188,9 @@ static int teql_qdisc_init(struct Qdisc + struct teql_master *m = (struct teql_master*)sch->ops; + struct teql_sched_data *q = qdisc_priv(sch); + ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ + if (dev->hard_header_len > m->dev->hard_header_len) + return -EINVAL; + +diff -uprN linux-2.6.15.orig/net/socket.c linux-2.6.15-ve025stab014/net/socket.c +--- linux-2.6.15.orig/net/socket.c 2006-01-03 06:21:10.000000000 +0300 ++++ linux-2.6.15-ve025stab014/net/socket.c 2006-01-27 14:48:08.000000000 +0300 +@@ -84,6 +84,7 @@ + #include <linux/compat.h> + #include <linux/kmod.h> + #include <linux/audit.h> ++#include <linux/in.h> + + #ifdef CONFIG_NET_RADIO + #include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ +@@ -1072,6 +1073,37 @@ 
int sock_wake_async(struct socket *sock,
+ 	return 0;
+ }
+
++int vz_security_proto_check(int family, int type, int protocol)
++{
++#ifdef CONFIG_VE
++	if (ve_is_super(get_exec_env()))
++		return 0;
++
++	switch (family) {
++	case PF_UNSPEC:
++	case PF_PACKET:
++	case PF_NETLINK:
++	case PF_UNIX:
++		break;
++	case PF_INET:
++		switch (protocol) {
++		case IPPROTO_IP:
++		case IPPROTO_ICMP:
++		case IPPROTO_TCP:
++		case IPPROTO_UDP:
++		case IPPROTO_RAW:
++			break;
++		default:
++			return -EAFNOSUPPORT;
++		}
++		break;
++	default:
++		return -EAFNOSUPPORT;
++	}
++#endif
++	return 0;
++}
++
+ static int __sock_create(int family, int type, int protocol, struct socket **res, int kern)
+ {
+ 	int err;
+@@ -1099,6 +1131,11 @@ static int __sock_create(int family, int
+ 		family = PF_PACKET;
+ 	}
+
++	/* VZ compatibility layer */
++	err = vz_security_proto_check(family, type, protocol);
++	if (err < 0)
++		return err;
++
+ 	err = security_socket_create(family, type, protocol, kern);
+ 	if (err)
+ 		return err;
+diff -uprN linux-2.6.15.orig/net/sunrpc/clnt.c linux-2.6.15-ve025stab014/net/sunrpc/clnt.c
+--- linux-2.6.15.orig/net/sunrpc/clnt.c	2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/net/sunrpc/clnt.c	2006-01-27 14:48:08.000000000 +0300
+@@ -168,10 +168,10 @@ rpc_new_client(struct rpc_xprt *xprt, ch
+ 	}
+
+ 	/* save the nodename */
+-	clnt->cl_nodelen = strlen(system_utsname.nodename);
++	clnt->cl_nodelen = strlen(ve_utsname.nodename);
+ 	if (clnt->cl_nodelen > UNX_MAXNODENAME)
+ 		clnt->cl_nodelen = UNX_MAXNODENAME;
+-	memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen);
++	memcpy(clnt->cl_nodename, ve_utsname.nodename, clnt->cl_nodelen);
+ 	return clnt;
+
+ out_no_auth:
+diff -uprN linux-2.6.15.orig/net/sunrpc/sched.c linux-2.6.15-ve025stab014/net/sunrpc/sched.c
+--- linux-2.6.15.orig/net/sunrpc/sched.c	2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/net/sunrpc/sched.c	2006-01-27 14:48:08.000000000 +0300
+@@ -592,7 +592,9 @@ static int rpc_wait_bit_interruptible(vo
+ static int __rpc_execute(struct rpc_task *task)
+ {
+ 	int status = 0;
++	struct ve_struct *env;
+
++	env = set_exec_env(get_ve0());
+ 	dprintk("RPC: %4d rpc_execute flgs %x\n",
+ 			task->tk_pid, task->tk_flags);
+
+@@ -681,6 +683,7 @@ static int __rpc_execute(struct rpc_task
+
+ 	/* Release all resources associated with the task */
+ 	rpc_release_task(task);
++	(void)set_exec_env(env);
+ 	return status;
+ }
+
+diff -uprN linux-2.6.15.orig/net/sunrpc/svcsock.c linux-2.6.15-ve025stab014/net/sunrpc/svcsock.c
+--- linux-2.6.15.orig/net/sunrpc/svcsock.c	2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/net/sunrpc/svcsock.c	2006-01-27 14:48:08.000000000 +0300
+@@ -361,6 +361,9 @@ svc_sendto(struct svc_rqst *rqstp, struc
+ 	size_t base = xdr->page_base;
+ 	unsigned int pglen = xdr->page_len;
+ 	unsigned int flags = MSG_MORE;
++	struct ve_struct *old_env;
++
++	old_env = set_exec_env(get_ve0());
+
+ 	slen = xdr->len;
+
+@@ -425,6 +428,8 @@ out:
+ 		rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len,
+ 		rqstp->rq_addr.sin_addr.s_addr);
+
++	(void)set_exec_env(old_env);
++
+ 	return len;
+ }
+
+@@ -437,9 +442,12 @@ svc_recv_available(struct svc_sock *svsk
+ 	mm_segment_t oldfs;
+ 	struct socket *sock = svsk->sk_sock;
+ 	int avail, err;
++	struct ve_struct *old_env;
+
+ 	oldfs = get_fs(); set_fs(KERNEL_DS);
++	old_env = set_exec_env(get_ve0());
+ 	err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail);
++	(void)set_exec_env(old_env);
+ 	set_fs(oldfs);
+
+ 	return (err >= 0)? avail : err;
+@@ -454,6 +462,7 @@ svc_recvfrom(struct svc_rqst *rqstp, str
+ 	struct msghdr msg;
+ 	struct socket *sock;
+ 	int len, alen;
++	struct ve_struct *old_env;
+
+ 	rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
+ 	sock = rqstp->rq_sock->sk_sock;
+@@ -465,7 +474,9 @@ svc_recvfrom(struct svc_rqst *rqstp, str
+
+ 	msg.msg_flags = MSG_DONTWAIT;
+
++	old_env = set_exec_env(get_ve0());
+ 	len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT);
++	(void)set_exec_env(get_ve0());
+
+ 	/* sock_recvmsg doesn't fill in the name/namelen, so we must..
+ 	 * possibly we should cache this in the svc_sock structure
+@@ -761,17 +772,19 @@ svc_tcp_accept(struct svc_sock *svsk)
+ 	struct proto_ops *ops;
+ 	struct svc_sock *newsvsk;
+ 	int err, slen;
++	struct ve_struct *old_env;
+
+ 	dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
+ 	if (!sock)
+ 		return;
+
++	old_env = set_exec_env(get_ve0());
+ 	err = sock_create_lite(PF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock);
+ 	if (err) {
+ 		if (err == -ENOMEM)
+ 			printk(KERN_WARNING "%s: no more sockets!\n",
+ 			       serv->sv_name);
+-		return;
++		goto restore;
+ 	}
+
+ 	dprintk("svc: tcp_accept %p allocated\n", newsock);
+@@ -865,6 +878,8 @@ svc_tcp_accept(struct svc_sock *svsk)
+
+ 	}
+
++	(void)set_exec_env(old_env);
++
+ 	if (serv->sv_stats)
+ 		serv->sv_stats->nettcpconn++;
+
+@@ -872,6 +887,8 @@ svc_tcp_accept(struct svc_sock *svsk)
+
+ failed:
+ 	sock_release(newsock);
++restore:
++	(void)set_exec_env(old_env);
+ 	return;
+ }
+
+@@ -1388,6 +1405,7 @@ svc_create_socket(struct svc_serv *serv,
+ 	struct socket *sock;
+ 	int error;
+ 	int type;
++	struct ve_struct *old_env;
+
+ 	dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n",
+ 			serv->sv_program->pg_name, protocol,
+@@ -1401,8 +1419,10 @@ svc_create_socket(struct svc_serv *serv,
+ 	}
+ 	type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+
++	old_env = set_exec_env(get_ve0());
++
+ 	if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0)
+-		return error;
++		goto restore;
+
+ 	if (sin != NULL) {
+ 		if (type == SOCK_STREAM)
+@@ -1418,12 +1438,16 @@ svc_create_socket(struct svc_serv *serv,
+ 			goto bummer;
+ 	}
+
+-	if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL)
++	if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) {
++		(void)set_exec_env(old_env);
+ 		return 0;
++	}
+
+ bummer:
+ 	dprintk("svc: svc_create_socket error = %d\n", -error);
+ 	sock_release(sock);
++restore:
++	(void)set_exec_env(old_env);
+ 	return error;
+ }
+
+diff -uprN linux-2.6.15.orig/net/unix/af_unix.c linux-2.6.15-ve025stab014/net/unix/af_unix.c
+--- linux-2.6.15.orig/net/unix/af_unix.c	2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/net/unix/af_unix.c	2006-01-27 14:48:08.000000000 +0300
+@@ -118,6 +118,9 @@
+ #include <net/checksum.h>
+ #include <linux/security.h>
+
++#include <ub/ub_net.h>
++#include <ub/beancounter.h>
++
+ int sysctl_unix_max_dgram_qlen = 10;
+
+ struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
+@@ -235,6 +238,8 @@ static struct sock *__unix_find_socket_b
+ 	sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
+ 		struct unix_sock *u = unix_sk(s);
+
++		if (!ve_accessible(VE_OWNER_SK(s), get_exec_env()))
++			continue;
+ 		if (u->addr->len == len &&
+ 		    !memcmp(u->addr->name, sunname, len))
+ 			goto found;
+@@ -439,7 +444,7 @@ static int unix_listen(struct socket *so
+ 	sk->sk_max_ack_backlog = backlog;
+ 	sk->sk_state = TCP_LISTEN;
+ 	/* set credentials so connect can copy them */
+-	sk->sk_peercred.pid = current->tgid;
++	sk->sk_peercred.pid = virt_tgid(current);
+ 	sk->sk_peercred.uid = current->euid;
+ 	sk->sk_peercred.gid = current->egid;
+ 	err = 0;
+@@ -553,6 +558,8 @@ static struct sock * unix_create1(struct
+ 	sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
+ 	if (!sk)
+ 		goto out;
++	if (ub_other_sock_charge(sk))
++		goto out_sk_free;
+
+ 	atomic_inc(&unix_nr_socks);
+
+@@ -571,6 +578,9 @@ static struct sock * unix_create1(struct
+ 	unix_insert_socket(unix_sockets_unbound, sk);
+ out:
+ 	return sk;
++out_sk_free:
++	sk_free(sk);
++	return NULL;
+ }
+
+ static int unix_create(struct socket *sock, int protocol)
+@@ -932,6 +942,7 @@ static int unix_stream_connect(struct so
+ 	int st;
+ 	int err;
+ 	long timeo;
++	unsigned long chargesize;
+
+ 	err = unix_mkname(sunaddr, addr_len, &hash);
+ 	if (err < 0)
+@@ -960,6 +971,10 @@ static int unix_stream_connect(struct so
+ 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
+ 	if (skb == NULL)
+ 		goto out;
++	chargesize = skb_charge_fullsize(skb);
++	if (ub_sock_getwres_other(newsk, chargesize) < 0)
++		goto out;
++	ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF);
+
+ restart:
+ 	/* Find listening sock. */
+@@ -1043,7 +1058,7 @@ restart:
+ 	unix_peer(newsk) = sk;
+ 	newsk->sk_state = TCP_ESTABLISHED;
+ 	newsk->sk_type = sk->sk_type;
+-	newsk->sk_peercred.pid = current->tgid;
++	newsk->sk_peercred.pid = virt_tgid(current);
+ 	newsk->sk_peercred.uid = current->euid;
+ 	newsk->sk_peercred.gid = current->egid;
+ 	newu = unix_sk(newsk);
+@@ -1105,7 +1120,7 @@ static int unix_socketpair(struct socket
+ 	sock_hold(skb);
+ 	unix_peer(ska)=skb;
+ 	unix_peer(skb)=ska;
+-	ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
++	ska->sk_peercred.pid = skb->sk_peercred.pid = virt_tgid(current);
+ 	ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
+ 	ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
+
+@@ -1431,6 +1446,16 @@ static int unix_stream_sendmsg(struct ki
+
+ 		size=len-sent;
+
++		if (msg->msg_flags & MSG_DONTWAIT)
++			ub_sock_makewres_other(sk, skb_charge_size(size));
++		if (sock_bc(sk) != NULL &&
++				sock_bc(sk)->poll_reserv >=
++				SOCK_MIN_UBCSPACE &&
++				skb_charge_size(size) >
++				sock_bc(sk)->poll_reserv)
++			size = skb_charge_datalen(sock_bc(sk)->poll_reserv);
++
++
+ 		/* Keep two messages in the pipe so it schedules better */
+ 		if (size > sk->sk_sndbuf / 2 - 64)
+ 			size = sk->sk_sndbuf / 2 - 64;
+@@ -1442,7 +1467,8 @@ static int unix_stream_sendmsg(struct ki
+ 		 * Grab a buffer
+ 		 */
+
+-		skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
++		skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE,
++				msg->msg_flags&MSG_DONTWAIT, &err);
+
+ 		if (skb==NULL)
+ 			goto out_err;
+@@ -1870,6 +1896,7 @@ static unsigned int unix_poll(struct fil
+ {
+ 	struct sock *sk = sock->sk;
+ 	unsigned int mask;
++	int no_ub_res;
+
+ 	poll_wait(file, sk->sk_sleep, wait);
+ 	mask = 0;
+@@ -1880,6 +1907,10 @@ static unsigned int unix_poll(struct fil
+ 	if (sk->sk_shutdown == SHUTDOWN_MASK)
+ 		mask |= POLLHUP;
+
++	no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
++	if (no_ub_res)
++		ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
++
+ 	/* readable? */
+ 	if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ 	    (sk->sk_shutdown & RCV_SHUTDOWN))
+@@ -1893,7 +1924,7 @@ static unsigned int unix_poll(struct fil
+ 	 * we set writable also when the other side has shut down the
+ 	 * connection. This prevents stuck sockets.
+ 	 */
+-	if (unix_writable(sk))
++	if (!no_ub_res && unix_writable(sk))
+ 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+ 	return mask;
+@@ -2045,7 +2076,7 @@ static int __init af_unix_init(void)
+
+ 	sock_register(&unix_family_ops);
+ #ifdef CONFIG_PROC_FS
+-	proc_net_fops_create("unix", 0, &unix_seq_fops);
++	proc_glob_fops_create("net/unix", 0, &unix_seq_fops);
+ #endif
+ 	unix_sysctl_register();
+ out:
+@@ -2056,7 +2087,7 @@ static void __exit af_unix_exit(void)
+ {
+ 	sock_unregister(PF_UNIX);
+ 	unix_sysctl_unregister();
+-	proc_net_remove("unix");
++	remove_proc_glob_entry("net/unix", NULL);
+ 	proto_unregister(&unix_proto);
+ }
+
+diff -uprN linux-2.6.15.orig/security/commoncap.c linux-2.6.15-ve025stab014/security/commoncap.c
+--- linux-2.6.15.orig/security/commoncap.c	2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/security/commoncap.c	2006-01-27 14:48:08.000000000 +0300
+@@ -34,7 +34,7 @@ EXPORT_SYMBOL(cap_netlink_send);
+
+ int cap_netlink_recv(struct sk_buff *skb)
+ {
+-	if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
++	if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 	return 0;
+ }
+@@ -311,7 +311,7 @@ void cap_task_reparent_to_init (struct t
+
+ int cap_syslog (int type)
+ {
+-	if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN))
++	if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN))
+ 		return -EPERM;
+ 	return 0;
+ }
+diff -uprN linux-2.6.15.orig/security/selinux/hooks.c linux-2.6.15-ve025stab014/security/selinux/hooks.c
+--- linux-2.6.15.orig/security/selinux/hooks.c	2006-01-03 06:21:10.000000000 +0300
++++ linux-2.6.15-ve025stab014/security/selinux/hooks.c	2006-01-27 14:48:08.000000000 +0300
+@@ -4199,12 +4199,12 @@ static int selinux_setprocattr(struct ta
+ 		struct task_struct *g, *t;
+ 		struct mm_struct *mm = p->mm;
+ 		read_lock(&tasklist_lock);
+-		do_each_thread(g, t)
++		do_each_thread_ve(g, t)
+ 			if (t->mm == mm && t != p) {
+ 				read_unlock(&tasklist_lock);
+ 				return -EPERM;
+ 			}
+-		while_each_thread(g, t);
++		while_each_thread_ve(g, t);
+ 		read_unlock(&tasklist_lock);
+ 	}