--- sys/amd64/amd64/cpu_switch.S.orig +++ sys/amd64/amd64/cpu_switch.S @@ -105,10 +105,10 @@ /* have we used fp, and need a save? */ cmpq %rdi,PCPU(FPCURTHREAD) - jne 3f + jne 2f movq PCB_SAVEFPU(%r8),%r8 clts - cmpl $0,use_xsave + cmpl $0,use_xsave(%rip) jne 1f fxsave (%r8) jmp 2f @@ -120,12 +120,7 @@ /* This is patched to xsaveopt if supported, see fpuinit_bsp1() */ xsave (%r8) movq %rcx,%rdx -2: smsw %ax - orb $CR0_TS,%al - lmsw %ax - xorl %eax,%eax - movq %rax,PCPU(FPCURTHREAD) -3: +2: /* Save is done. Now fire up new thread. Leave old vmspace. */ movq %rsi,%r12 movq %rdi,%r13 @@ -212,6 +207,8 @@ movq PCB_RBX(%r8),%rbx movq PCB_RIP(%r8),%rax movq %rax,(%rsp) + movq PCPU(CURTHREAD),%rdi + call fpu_activate_sw ret /* --- sys/amd64/amd64/fpu.c.orig +++ sys/amd64/amd64/fpu.c @@ -139,6 +139,11 @@ SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 1, "Floating point instructions executed in hardware"); +int lazy_fpu_switch = 0; +SYSCTL_INT(_hw, OID_AUTO, lazy_fpu_switch, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, + &lazy_fpu_switch, 0, + "Lazily load FPU context after context switch"); + int use_xsave; /* non-static for cpu_switch.S */ uint64_t xsave_mask; /* the same */ static uma_zone_t fpu_save_area_zone; @@ -204,6 +209,7 @@ u_int cp[4]; uint64_t xsave_mask_user; + TUNABLE_INT_FETCH("hw.lazy_fpu_switch", &lazy_fpu_switch); if ((cpu_feature2 & CPUID2_XSAVE) != 0) { use_xsave = 1; TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); @@ -611,6 +617,45 @@ return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]); } +static void +restore_fpu_curthread(struct thread *td) +{ + struct pcb *pcb; + + /* + * Record new context early in case frstor causes a trap. + */ + PCPU_SET(fpcurthread, td); + + stop_emulating(); + fpu_clean_state(); + pcb = td->td_pcb; + + if ((pcb->pcb_flags & PCB_FPUINITDONE) == 0) { + /* + * This is the first time this thread has used the FPU or + * the PCB doesn't contain a clean FPU state. Explicitly + * load an initial state. + * + * We prefer to restore the state from the actual save + * area in PCB instead of directly loading from + * fpu_initialstate, to ignite the XSAVEOPT + * tracking engine. + */ + bcopy(fpu_initialstate, pcb->pcb_save, + cpu_max_ext_state_size); + fpurestore(pcb->pcb_save); + if (pcb->pcb_initial_fpucw != __INITIAL_FPUCW__) + fldcw(pcb->pcb_initial_fpucw); + if (PCB_USER_FPU(pcb)) + set_pcb_flags(pcb, PCB_FPUINITDONE | + PCB_USERFPUINITDONE); + else + set_pcb_flags(pcb, PCB_FPUINITDONE); + } else + fpurestore(pcb->pcb_save); +} + /* * Device Not Available (DNA, #NM) exception handler. * @@ -621,7 +666,9 @@ void fpudna(void) { + struct thread *td; + td = curthread; /* * This handler is entered with interrupts enabled, so context * switches may occur before critical_enter() is executed. If @@ -635,49 +682,38 @@ KASSERT((curpcb->pcb_flags & PCB_FPUNOSAVE) == 0, ("fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)")); - if (PCPU_GET(fpcurthread) == curthread) { - printf("fpudna: fpcurthread == curthread\n"); + if (__predict_false(PCPU_GET(fpcurthread) == td)) { + /* + * Some virtual machines seems to set %cr0.TS at + * arbitrary moments. Silently clear the TS bit + * regardless of the eager/lazy FPU context switch + * mode. + */ stop_emulating(); - critical_exit(); - return; + } else { + if (__predict_false(PCPU_GET(fpcurthread) != NULL)) { + panic( + "fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n", + PCPU_GET(fpcurthread), + PCPU_GET(fpcurthread)->td_tid, td, td->td_tid); + } + restore_fpu_curthread(td); } - if (PCPU_GET(fpcurthread) != NULL) { - panic("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n", - PCPU_GET(fpcurthread), PCPU_GET(fpcurthread)->td_tid, - curthread, curthread->td_tid); - } - stop_emulating(); - /* - * Record new context early in case frstor causes a trap. - */ - PCPU_SET(fpcurthread, curthread); + critical_exit(); +} - fpu_clean_state(); +void fpu_activate_sw(struct thread *td); /* Called from the context switch */ +void +fpu_activate_sw(struct thread *td) +{ - if ((curpcb->pcb_flags & PCB_FPUINITDONE) == 0) { - /* - * This is the first time this thread has used the FPU or - * the PCB doesn't contain a clean FPU state. Explicitly - * load an initial state. - * - * We prefer to restore the state from the actual save - * area in PCB instead of directly loading from - * fpu_initialstate, to ignite the XSAVEOPT - * tracking engine. - */ - bcopy(fpu_initialstate, curpcb->pcb_save, - cpu_max_ext_state_size); - fpurestore(curpcb->pcb_save); - if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__) - fldcw(curpcb->pcb_initial_fpucw); - if (PCB_USER_FPU(curpcb)) - set_pcb_flags(curpcb, - PCB_FPUINITDONE | PCB_USERFPUINITDONE); - else - set_pcb_flags(curpcb, PCB_FPUINITDONE); - } else - fpurestore(curpcb->pcb_save); - critical_exit(); + if (lazy_fpu_switch || (td->td_pflags & TDP_KTHREAD) != 0 || + !PCB_USER_FPU(td->td_pcb)) { + PCPU_SET(fpcurthread, NULL); + start_emulating(); + } else if (PCPU_GET(fpcurthread) != td) { + restore_fpu_curthread(td); + } } void --- sys/i386/i386/swtch.s.orig +++ sys/i386/i386/swtch.s @@ -293,6 +293,12 @@ cpu_switch_load_gs: mov PCB_GS(%edx),%gs + pushl %edx + pushl PCPU(CURTHREAD) + call npxswitch + popl %edx + popl %edx + /* Test if debug registers should be restored. */ testl $PCB_DBREGS,PCB_FLAGS(%edx) jz 1f --- sys/i386/isa/npx.c.orig +++ sys/i386/isa/npx.c @@ -191,6 +191,11 @@ SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD, &hw_float, 0, "Floating point instructions executed in hardware"); +int lazy_fpu_switch = 0; +SYSCTL_INT(_hw, OID_AUTO, lazy_fpu_switch, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, + &lazy_fpu_switch, 0, + "Lazily load FPU context after context switch"); + int use_xsave; uint64_t xsave_mask; static uma_zone_t fpu_save_area_zone; @@ -327,6 +332,7 @@ u_int cp[4]; uint64_t xsave_mask_user; + TUNABLE_INT_FETCH("hw.lazy_fpu_switch", &lazy_fpu_switch); if (cpu_fxsr && (cpu_feature2 & CPUID2_XSAVE) != 0) { use_xsave = 1; TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); @@ -785,47 +791,20 @@ return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]); } -/* - * Implement device not available (DNA) exception - * - * It would be better to switch FP context here (if curthread != fpcurthread) - * and not necessarily for every context switch, but it is too hard to - * access foreign pcb's. - */ - -static int err_count = 0; - -int -npxdna(void) +static void +restore_npx_curthread(struct thread *td, struct pcb *pcb) { - if (!hw_float) - return (0); - critical_enter(); - if (PCPU_GET(fpcurthread) == curthread) { - printf("npxdna: fpcurthread == curthread %d times\n", - ++err_count); - stop_emulating(); - critical_exit(); - return (1); - } - if (PCPU_GET(fpcurthread) != NULL) { - printf("npxdna: fpcurthread = %p (%d), curthread = %p (%d)\n", - PCPU_GET(fpcurthread), - PCPU_GET(fpcurthread)->td_proc->p_pid, - curthread, curthread->td_proc->p_pid); - panic("npxdna"); - } - stop_emulating(); /* * Record new context early in case frstor causes a trap. */ - PCPU_SET(fpcurthread, curthread); + PCPU_SET(fpcurthread, td); + stop_emulating(); if (cpu_fxsr) fpu_clean_state(); - if ((curpcb->pcb_flags & PCB_NPXINITDONE) == 0) { + if ((pcb->pcb_flags & PCB_NPXINITDONE) == 0) { /* * This is the first time this thread has used the FPU or * the PCB doesn't contain a clean FPU state. Explicitly @@ -836,18 +815,54 @@ * npx_initialstate, to ignite the XSAVEOPT * tracking engine. */ - bcopy(npx_initialstate, curpcb->pcb_save, cpu_max_ext_state_size); - fpurstor(curpcb->pcb_save); - if (curpcb->pcb_initial_npxcw != __INITIAL_NPXCW__) - fldcw(curpcb->pcb_initial_npxcw); - curpcb->pcb_flags |= PCB_NPXINITDONE; - if (PCB_USER_FPU(curpcb)) - curpcb->pcb_flags |= PCB_NPXUSERINITDONE; + bcopy(npx_initialstate, pcb->pcb_save, cpu_max_ext_state_size); + fpurstor(pcb->pcb_save); + if (pcb->pcb_initial_npxcw != __INITIAL_NPXCW__) + fldcw(pcb->pcb_initial_npxcw); + pcb->pcb_flags |= PCB_NPXINITDONE; + if (PCB_USER_FPU(pcb)) + pcb->pcb_flags |= PCB_NPXUSERINITDONE; } else { - fpurstor(curpcb->pcb_save); + fpurstor(pcb->pcb_save); } +} + +/* + * Implement device not available (DNA) exception + * + * It would be better to switch FP context here (if curthread != fpcurthread) + * and not necessarily for every context switch, but it is too hard to + * access foreign pcb's. + */ +int +npxdna(void) +{ + struct thread *td; + + if (!hw_float) + return (0); + td = curthread; + critical_enter(); + if (__predict_false(PCPU_GET(fpcurthread) == td)) { + /* + * Some virtual machines seems to set %cr0.TS at + * arbitrary moments. Silently clear the TS bit + * regardless of the eager/lazy FPU context switch + * mode. + */ + stop_emulating(); + } else { + if (__predict_false(PCPU_GET(fpcurthread) != NULL)) { + printf( + "npxdna: fpcurthread = %p (%d), curthread = %p (%d)\n", + PCPU_GET(fpcurthread), + PCPU_GET(fpcurthread)->td_proc->p_pid, + td, td->td_proc->p_pid); + panic("npxdna"); + } + restore_npx_curthread(td, td->td_pcb); + } critical_exit(); - return (1); } @@ -869,10 +884,22 @@ xsaveopt((char *)addr, xsave_mask); else fpusave(addr); - start_emulating(); - PCPU_SET(fpcurthread, NULL); } +void npxswitch(struct thread *td, struct pcb *pcb); +void +npxswitch(struct thread *td, struct pcb *pcb) +{ + + if (lazy_fpu_switch || (td->td_pflags & TDP_KTHREAD) != 0 || + !PCB_USER_FPU(pcb)) { + start_emulating(); + PCPU_SET(fpcurthread, NULL); + } else if (PCPU_GET(fpcurthread) != td) { + restore_npx_curthread(td, pcb); + } +} + /* * Unconditionally save the current co-processor state across suspend and * resume.