--- sys/amd64/amd64/cpu_switch.S.orig
+++ sys/amd64/amd64/cpu_switch.S
@@ -105,10 +105,10 @@
 
 	/* have we used fp, and need a save? */
 	cmpq	%rdi,PCPU(FPCURTHREAD)
-	jne	3f
+	jne	2f
 	movq	PCB_SAVEFPU(%r8),%r8
 	clts
-	cmpl	$0,use_xsave
+	cmpl	$0,use_xsave(%rip)
 	jne	1f
 	fxsave	(%r8)
 	jmp	2f
@@ -120,12 +120,7 @@
 	/* This is patched to xsaveopt if supported, see fpuinit_bsp1() */
 	xsave	(%r8)
 	movq	%rcx,%rdx
-2:	smsw	%ax
-	orb	$CR0_TS,%al
-	lmsw	%ax
-	xorl	%eax,%eax
-	movq	%rax,PCPU(FPCURTHREAD)
-3:
+2:
 	/* Save is done.  Now fire up new thread. Leave old vmspace. */
 	movq	%rsi,%r12
 	movq	%rdi,%r13
@@ -212,6 +207,8 @@
 	movq	PCB_RBX(%r8),%rbx
 	movq	PCB_RIP(%r8),%rax
 	movq	%rax,(%rsp)
+	movq	PCPU(CURTHREAD),%rdi
+	call	fpu_activate_sw
 	ret
 
 	/*
--- sys/amd64/amd64/fpu.c.orig
+++ sys/amd64/amd64/fpu.c
@@ -139,6 +139,11 @@
 SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 1, "Floating point instructions executed in hardware");
 
+int lazy_fpu_switch = 0;
+SYSCTL_INT(_hw, OID_AUTO, lazy_fpu_switch, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
+    &lazy_fpu_switch, 0,
+    "Lazily load FPU context after context switch");
+
 int use_xsave;			/* non-static for cpu_switch.S */
 uint64_t xsave_mask;		/* the same */
 static	uma_zone_t fpu_save_area_zone;
@@ -204,6 +209,7 @@
 	u_int cp[4];
 	uint64_t xsave_mask_user;
 
+	TUNABLE_INT_FETCH("hw.lazy_fpu_switch", &lazy_fpu_switch);
 	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
 		use_xsave = 1;
 		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
@@ -611,6 +617,45 @@
 	return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]);
 }
 
+static void
+restore_fpu_curthread(struct thread *td)
+{
+	struct pcb *pcb;
+
+	/*
+	 * Record new context early in case frstor causes a trap.
+	 */
+	PCPU_SET(fpcurthread, td);
+
+	stop_emulating();
+	fpu_clean_state();
+	pcb = td->td_pcb;
+
+	if ((pcb->pcb_flags & PCB_FPUINITDONE) == 0) {
+		/*
+		 * This is the first time this thread has used the FPU or
+		 * the PCB doesn't contain a clean FPU state.  Explicitly
+		 * load an initial state.
+		 *
+		 * We prefer to restore the state from the actual save
+		 * area in PCB instead of directly loading from
+		 * fpu_initialstate, to ignite the XSAVEOPT
+		 * tracking engine.
+		 */
+		bcopy(fpu_initialstate, pcb->pcb_save,
+		    cpu_max_ext_state_size);
+		fpurestore(pcb->pcb_save);
+		if (pcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
+			fldcw(pcb->pcb_initial_fpucw);
+		if (PCB_USER_FPU(pcb))
+			set_pcb_flags(pcb, PCB_FPUINITDONE |
+			    PCB_USERFPUINITDONE);
+		else
+			set_pcb_flags(pcb, PCB_FPUINITDONE);
+	} else
+		fpurestore(pcb->pcb_save);
+}
+
 /*
  * Device Not Available (DNA, #NM) exception handler.
  *
@@ -621,7 +666,9 @@
 void
 fpudna(void)
 {
+	struct thread *td;
 
+	td = curthread;
 	/*
 	 * This handler is entered with interrupts enabled, so context
 	 * switches may occur before critical_enter() is executed.  If
@@ -635,49 +682,38 @@
 
 	KASSERT((curpcb->pcb_flags & PCB_FPUNOSAVE) == 0,
 	    ("fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)"));
-	if (PCPU_GET(fpcurthread) == curthread) {
-		printf("fpudna: fpcurthread == curthread\n");
+	if (__predict_false(PCPU_GET(fpcurthread) == td)) {
+		/*
+		 * Some virtual machines seems to set %cr0.TS at
+		 * arbitrary moments.  Silently clear the TS bit
+		 * regardless of the eager/lazy FPU context switch
+		 * mode.
+		 */
 		stop_emulating();
-		critical_exit();
-		return;
+	} else {
+		if (__predict_false(PCPU_GET(fpcurthread) != NULL)) {
+			panic(
+		    "fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
+			    PCPU_GET(fpcurthread),
+			    PCPU_GET(fpcurthread)->td_tid, td, td->td_tid);
+		}
+		restore_fpu_curthread(td);
 	}
-	if (PCPU_GET(fpcurthread) != NULL) {
-		panic("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
-		    PCPU_GET(fpcurthread), PCPU_GET(fpcurthread)->td_tid,
-		    curthread, curthread->td_tid);
-	}
-	stop_emulating();
-	/*
-	 * Record new context early in case frstor causes a trap.
-	 */
-	PCPU_SET(fpcurthread, curthread);
+	critical_exit();
+}
 
-	fpu_clean_state();
+void fpu_activate_sw(struct thread *td); /* Called from the context switch */
+void
+fpu_activate_sw(struct thread *td)
+{
 
-	if ((curpcb->pcb_flags & PCB_FPUINITDONE) == 0) {
-		/*
-		 * This is the first time this thread has used the FPU or
-		 * the PCB doesn't contain a clean FPU state.  Explicitly
-		 * load an initial state.
-		 *
-		 * We prefer to restore the state from the actual save
-		 * area in PCB instead of directly loading from
-		 * fpu_initialstate, to ignite the XSAVEOPT
-		 * tracking engine.
-		 */
-		bcopy(fpu_initialstate, curpcb->pcb_save,
-		    cpu_max_ext_state_size);
-		fpurestore(curpcb->pcb_save);
-		if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
-			fldcw(curpcb->pcb_initial_fpucw);
-		if (PCB_USER_FPU(curpcb))
-			set_pcb_flags(curpcb,
-			    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
-		else
-			set_pcb_flags(curpcb, PCB_FPUINITDONE);
-	} else
-		fpurestore(curpcb->pcb_save);
-	critical_exit();
+	if (lazy_fpu_switch || (td->td_pflags & TDP_KTHREAD) != 0 ||
+	    !PCB_USER_FPU(td->td_pcb)) {
+		PCPU_SET(fpcurthread, NULL);
+		start_emulating();
+	} else if (PCPU_GET(fpcurthread) != td) {
+		restore_fpu_curthread(td);
+	}
 }
 
 void
--- sys/i386/i386/swtch.s.orig
+++ sys/i386/i386/swtch.s
@@ -293,6 +293,12 @@
 cpu_switch_load_gs:
 	mov	PCB_GS(%edx),%gs
 
+	pushl	%edx
+	pushl	PCPU(CURTHREAD)
+	call	npxswitch
+	popl	%edx
+	popl	%edx
+
 	/* Test if debug registers should be restored. */
 	testl	$PCB_DBREGS,PCB_FLAGS(%edx)
 	jz      1f
--- sys/i386/isa/npx.c.orig
+++ sys/i386/isa/npx.c
@@ -191,6 +191,11 @@
 SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
     &hw_float, 0, "Floating point instructions executed in hardware");
 
+int lazy_fpu_switch = 0;
+SYSCTL_INT(_hw, OID_AUTO, lazy_fpu_switch, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
+    &lazy_fpu_switch, 0,
+    "Lazily load FPU context after context switch");
+
 int use_xsave;
 uint64_t xsave_mask;
 static	uma_zone_t fpu_save_area_zone;
@@ -327,6 +332,7 @@
 	u_int cp[4];
 	uint64_t xsave_mask_user;
 
+	TUNABLE_INT_FETCH("hw.lazy_fpu_switch", &lazy_fpu_switch);
 	if (cpu_fxsr && (cpu_feature2 & CPUID2_XSAVE) != 0) {
 		use_xsave = 1;
 		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
@@ -785,47 +791,20 @@
 	return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]);
 }
 
-/*
- * Implement device not available (DNA) exception
- *
- * It would be better to switch FP context here (if curthread != fpcurthread)
- * and not necessarily for every context switch, but it is too hard to
- * access foreign pcb's.
- */
-
-static int err_count = 0;
-
-int
-npxdna(void)
+static void
+restore_npx_curthread(struct thread *td, struct pcb *pcb)
 {
 
-	if (!hw_float)
-		return (0);
-	critical_enter();
-	if (PCPU_GET(fpcurthread) == curthread) {
-		printf("npxdna: fpcurthread == curthread %d times\n",
-		    ++err_count);
-		stop_emulating();
-		critical_exit();
-		return (1);
-	}
-	if (PCPU_GET(fpcurthread) != NULL) {
-		printf("npxdna: fpcurthread = %p (%d), curthread = %p (%d)\n",
-		       PCPU_GET(fpcurthread),
-		       PCPU_GET(fpcurthread)->td_proc->p_pid,
-		       curthread, curthread->td_proc->p_pid);
-		panic("npxdna");
-	}
-	stop_emulating();
 	/*
 	 * Record new context early in case frstor causes a trap.
 	 */
-	PCPU_SET(fpcurthread, curthread);
+	PCPU_SET(fpcurthread, td);
 
+	stop_emulating();
 	if (cpu_fxsr)
 		fpu_clean_state();
 
-	if ((curpcb->pcb_flags & PCB_NPXINITDONE) == 0) {
+	if ((pcb->pcb_flags & PCB_NPXINITDONE) == 0) {
 		/*
 		 * This is the first time this thread has used the FPU or
 		 * the PCB doesn't contain a clean FPU state.  Explicitly
@@ -836,18 +815,54 @@
 		 * npx_initialstate, to ignite the XSAVEOPT
 		 * tracking engine.
 		 */
-		bcopy(npx_initialstate, curpcb->pcb_save, cpu_max_ext_state_size);
-		fpurstor(curpcb->pcb_save);
-		if (curpcb->pcb_initial_npxcw != __INITIAL_NPXCW__)
-			fldcw(curpcb->pcb_initial_npxcw);
-		curpcb->pcb_flags |= PCB_NPXINITDONE;
-		if (PCB_USER_FPU(curpcb))
-			curpcb->pcb_flags |= PCB_NPXUSERINITDONE;
+		bcopy(npx_initialstate, pcb->pcb_save, cpu_max_ext_state_size);
+		fpurstor(pcb->pcb_save);
+		if (pcb->pcb_initial_npxcw != __INITIAL_NPXCW__)
+			fldcw(pcb->pcb_initial_npxcw);
+		pcb->pcb_flags |= PCB_NPXINITDONE;
+		if (PCB_USER_FPU(pcb))
+			pcb->pcb_flags |= PCB_NPXUSERINITDONE;
 	} else {
-		fpurstor(curpcb->pcb_save);
+		fpurstor(pcb->pcb_save);
 	}
+}
+
+/*
+ * Implement device not available (DNA) exception
+ *
+ * It would be better to switch FP context here (if curthread != fpcurthread)
+ * and not necessarily for every context switch, but it is too hard to
+ * access foreign pcb's.
+ */
+int
+npxdna(void)
+{
+	struct thread *td;
+
+	if (!hw_float)
+		return (0);
+	td = curthread;
+	critical_enter();
+	if (__predict_false(PCPU_GET(fpcurthread) == td)) {
+		/*
+		 * Some virtual machines seems to set %cr0.TS at
+		 * arbitrary moments.  Silently clear the TS bit
+		 * regardless of the eager/lazy FPU context switch
+		 * mode.
+		 */
+		stop_emulating();
+	} else {
+		if (__predict_false(PCPU_GET(fpcurthread) != NULL)) {
+			printf(
+		    "npxdna: fpcurthread = %p (%d), curthread = %p (%d)\n",
+			    PCPU_GET(fpcurthread),
+			    PCPU_GET(fpcurthread)->td_proc->p_pid,
+			    td, td->td_proc->p_pid);
+			panic("npxdna");
+		}
+		restore_npx_curthread(td, td->td_pcb);
+	}
 	critical_exit();
-
 	return (1);
 }
 
@@ -869,10 +884,22 @@
 		xsaveopt((char *)addr, xsave_mask);
 	else
 		fpusave(addr);
-	start_emulating();
-	PCPU_SET(fpcurthread, NULL);
 }
 
+void npxswitch(struct thread *td, struct pcb *pcb);
+void
+npxswitch(struct thread *td, struct pcb *pcb)
+{
+
+	if (lazy_fpu_switch || (td->td_pflags & TDP_KTHREAD) != 0 ||
+	    !PCB_USER_FPU(pcb)) {
+		start_emulating();
+		PCPU_SET(fpcurthread, NULL);
+	} else if (PCPU_GET(fpcurthread) != td) {
+		restore_npx_curthread(td, pcb);
+	}
+}
+
 /*
  * Unconditionally save the current co-processor state across suspend and
  * resume.