Index: /branches/arm/lisp-kernel/arm-spentry.s
===================================================================
--- /branches/arm/lisp-kernel/arm-spentry.s	(revision 13826)
+++ /branches/arm/lisp-kernel/arm-spentry.s	(revision 13827)
@@ -496,8 +496,9 @@
 /* funcall nfn, returning multiple values if it does.  */
 _spentry(mvpass)
-	__(subs imm0,nargs,#node_size*nargregs)
-	__(movge imm0,#0)
-	__(add imm0,vsp,imm0)
-	__(build_lisp_frame(temp1,imm0))
+        __(cmp nargs,#node_size*nargregs)
+        __(mov imm1,vsp)
+	__(subgt imm1,imm1,#node_size*nargregs)
+	__(addgt imm1,imm1,nargs)
+	__(build_lisp_frame(imm0,imm1))
 	__(adr lr,C(ret1valn))
 	__(mov fn,#0)
@@ -884,48 +885,44 @@
 _spentry(set_hash_key)
 C(egc_set_hash_key):
-dnl         __(cmplr(cr2,arg_z,arg_x))
-dnl         __(la imm0,misc_data_offset(arg_y))
-dnl         __(str arg_z,arg_x,imm0)
-dnl         __(blelr cr2)
-dnl         __(add imm0,imm0,arg_x)
-dnl         __(ref_global(imm2,ref_base))
-dnl         __(load_highbit(imm3))
-dnl         __(ref_global(imm1,oldspace_dnode_count))
-dnl         __(sub imm0,imm0,imm2)
-dnl         __(srri(imm0,imm0,dnode_shift))       
-dnl         __(cmplr(imm0,imm1))
-dnl         __(extract_bit_shift_count(imm4,imm0))
-dnl         __(srri(imm0,imm0,bitmap_shift))       
-dnl         __(srr(imm3,imm3,imm4))
-dnl         __(ref_global(imm2,refbits))
-dnl         __(bgelr)
-dnl         __(slri(imm0,imm0,word_shift))
-dnl         __(ldrx(imm1,imm2,imm0))
-dnl         __(and. imm1,imm1,imm3)
-dnl         __(bne 2f)        
-dnl 1:      __(lrarx(imm1,imm2,imm0))
-dnl         __(or imm1,imm1,imm3)
-dnl         __(strcx(imm1,imm2,imm0))
-dnl         __(bne- 1b)
-dnl         __(isync)
-dnl 2:              
-dnl         __(ref_global(imm1,ref_base))
-dnl         __(sub imm0,arg_x,imm1)
-dnl         __(srri(imm0,imm0,dnode_shift))
-dnl         __(load_highbit(imm3))
-dnl         __(extract_bit_shift_count(imm4,imm0))
-dnl         __(srri(imm0,imm0,bitmap_shift))
-dnl         __(srr(imm3,imm3,imm4))
-dnl         __(slri(imm0,imm0,word_shift))
-dnl         __(ldrx(imm1,imm2,imm0))
-dnl         __(and. imm1,imm1,imm3)
-dnl         __(bnelr)
-dnl 3:      __(lrarx(imm1,imm2,imm0))
-dnl         __(or imm1,imm1,imm3)
-dnl         __(strcx(imm1,imm2,imm0))
-dnl         __(bne- 3b)
-dnl         __(isync)
-dnl         __(bx lr)
-
+        __(cmp arg_z,arg_x)
+	__(add imm0,arg_y,#misc_data_offset)
+	__(str arg_z,[arg_x,imm0])
+        __(bxhs lr)
+        __(add imm0,imm0,arg_x)
+        __(ref_global(temp0,ref_base))
+        __(sub imm0,imm0,temp0)
+        __(mov imm0,imm0,lsr #dnode_shift)
+        __(ref_global(imm1,oldspace_dnode_count))
+        __(cmp imm0,imm1)
+        __(bxhs lr)
+        __(and imm2,imm0,#31)
+        __(mov imm1,#0x80000000)
+        __(mov imm1,imm1,lsr imm2)
+        __(mov imm0,imm0,lsr #bitmap_shift)
+        __(ref_global(temp0,refbits))
+        __(add temp0,temp0,imm0,lsl #word_shift)
+0:      __(ldrex imm2,[temp0])
+        __(orr imm2,imm2,imm1)
+        __(strex imm0,imm2,[temp0])
+        __(cmp imm0,#0)
+        __(bne 0b)        
+/* Now need to ensure that the hash table itself is in the refmap; we
+   know that it's in bounds, etc. */
+        __(ref_global(temp0,ref_base))
+        __(sub imm0,arg_x,temp0)
+        __(mov imm0,imm0,lsr #dnode_shift)
+        __(and imm2,imm0,#31)
+        __(mov imm1,#0x80000000)
+        __(mov imm1,imm1,lsr imm2)
+        __(mov imm0,imm0,lsr #bitmap_shift)
+        __(ref_global(temp0,refbits))
+        __(add temp0,temp0,imm0,lsl #word_shift)
+1:      __(ldrex imm2,[temp0])
+        __(orr imm2,imm2,imm1)
+        __(strex imm0,imm2,[temp0])
+        __(cmp imm0,#0)
+        __(bne 1b)        
+        __(bx lr)
+        
 
 /*
@@ -964,5 +961,5 @@
         __(bne 1b)
         __(cmp arg_z,arg_x)
-        __(blo 4f)
+        __(bhi 4f)
 
         __(ref_global(imm0,ref_base))
@@ -981,4 +978,6 @@
         __(orr imm2,imm2,imm1)
         __(strex imm0,imm2,[temp0])
+        .globl C(egc_set_hash_key_conditional_test)
+C(egc_set_hash_key_conditional_test): 
         __(cmp imm0,#0)
         __(bne 2b)
@@ -988,54 +987,52 @@
     vsp`0' = (boxed) byte-offset 
    Interrupt-related issues are as in store_node_conditional, but
-l    we have to do more work to actually do the memoization.*/
+    we have to do more work to actually do the memoization.*/
 _spentry(set_hash_key_conditional)
         .globl C(egc_set_hash_key_conditional)
 C(egc_set_hash_key_conditional):
-dnl  __(cmplr(cr2,arg_z,arg_x))
-dnl  __(vpop(imm4))
-dnl  __(unbox_fixnum(imm4,imm4))
-dnl 1: __(lrarx(temp1,arg_x,imm4))
-dnl  __(cmpr(cr1,temp1,arg_y))
-dnl  __(bne cr1,5f)
-dnl  __(strcx(arg_z,arg_x,imm4))
-        .globl C(egc_set_hash_key_conditional_test)
-C(egc_set_hash_key_conditional_test): 
-dnl  __(bne 1b)
-dnl  __(isync)
-dnl  __(add imm0,imm4,arg_x)
-dnl  __(ref_global(imm2,ref_base))
-dnl  __(ref_global(imm1,oldspace_dnode_count))
-dnl  __(sub imm0,imm0,imm2)
-dnl  __(load_highbit(imm3))
-dnl  __(srri(imm0,imm0,dnode_shift))
-dnl  __(cmplr(imm0,imm1))
-dnl  __(extract_bit_shift_count(imm2,imm0))
-dnl  __(srri(imm0,imm0,bitmap_shift))
-dnl  __(srr(imm3,imm3,imm2))
-dnl  __(ref_global(imm2,refbits))
-dnl  __(bge 4f)
-dnl  __(slri(imm0,imm0,word_shift))
-dnl 2: __(lrarx(imm1,imm2,imm0))
-dnl  __(or imm1,imm1,imm3)
-dnl  __(strcx(imm1,imm2,imm0))
-dnl  __(bne- 2b)
-dnl  __(isync)
-dnl  /* Memoize hash table header */  
-dnl         __(ref_global(imm1,ref_base))
-dnl         __(sub imm0,arg_x,imm1)
-dnl         __(srri(imm0,imm0,dnode_shift))
-dnl         __(load_highbit(imm3))
-dnl         __(extract_bit_shift_count(imm4,imm0))
-dnl         __(srri(imm0,imm0,bitmap_shift))
-dnl         __(srr(imm3,imm3,imm4))
-dnl         __(slri(imm0,imm0,word_shift))
-dnl         __(ldrx(imm1,imm2,imm0))
-dnl         __(and. imm1,imm1,imm3)
-dnl         __(bne 4f)
-dnl 3:      __(lrarx(imm1,imm2,imm0))
-dnl         __(or imm1,imm1,imm3)
-dnl         __(strcx(imm1,imm2,imm0))
-dnl         __(bne- 3b)
-dnl         __(isync)
+        __(vpop1(imm1))
+        __(unbox_fixnum(imm1,imm1))
+0:      __(add imm2,arg_x,imm1)
+        __(ldrex temp1,[imm2])
+        __(cmp temp1,arg_y)
+        __(bne 5f)
+        __(strex imm0,arg_z,[imm2])
+        __(cmp imm0,#0)
+        __(bne 0b)
+        __(cmp arg_z,arg_x)
+        __(bhi 4f)
+        __(ref_global(temp0,ref_base))
+        __(sub imm0,imm2,temp0)
+        __(mov imm0,imm0,lsr #dnode_shift)
+        __(ref_global(imm1,oldspace_dnode_count))
+        __(cmp imm0,imm1)
+        __(bhs 4f)
+        __(and imm2,imm0,#31)
+        __(mov imm1,#0x80000000)
+        __(mov imm1,imm1,lsr imm2)
+        __(mov imm0,imm0,lsr #bitmap_shift)
+        __(ref_global(temp0,refbits))
+        __(add temp0,temp0,imm0,lsl #word_shift)
+1:      __(ldrex imm2,[temp0])
+        __(orr imm2,imm2,imm1)
+        __(strex imm0,imm2,[temp0])
+        __(cmp imm0,#0)
+        __(bne 1b)        
+/* Now need to ensure that the hash table itself is in the refmap; we
+   know that it's in bounds, etc. */
+        __(ref_global(temp0,ref_base))
+        __(sub imm0,arg_x,temp0)
+        __(mov imm0,imm0,lsr #dnode_shift)
+        __(and imm2,imm0,#31)
+        __(mov imm1,#0x80000000)
+        __(mov imm1,imm1,lsr imm2)
+        __(mov imm0,imm0,lsr #bitmap_shift)
+        __(ref_global(temp0,refbits))
+        __(add temp0,temp0,imm0,lsl #word_shift)
+1:      __(ldrex imm2,[temp0])
+        __(orr imm2,imm2,imm1)
+        __(strex imm0,imm2,[temp0])
+        __(cmp imm0,#0)
+        __(bne 1b)        
 C(egc_write_barrier_end):
 4:      __(mov arg_z,#nil_value)
@@ -1922,7 +1918,16 @@
 
 
-
-_spentry(unused0)
-
+/* This doesn't need to memoize anything, but needs pc-lusering
+   support because of the locative */
+_spentry(atomic_incf_node)
+        __(unbox_fixnum(imm0,arg_z))
+0:      __(add imm2,arg_y,imm0)
+        __(ldrex arg_z,[imm2])
+        __(add arg_z,arg_z,arg_x)
+        __(strex imm0,arg_z,[imm2])
+        __(cmp imm0,#0)
+        __(bne 0b)
+        __(bx lr)
+        
 _spentry(unused1)
 
@@ -2211,34 +2216,43 @@
 
 
-dnl 
-dnl         
-dnl /* Nargs is valid; all arg regs, lexpr-count pushed by caller.  */
-dnl /* imm0 = vsp to restore.  */
-dnl /* Return all values returned by caller to its caller, hiding  */
-dnl /* the variable-length arglist.  */
-dnl /* If we can detect that the caller's caller didn't expect  */
-dnl /* multiple values, then things are even simpler.  */
-_spentry(lexpr_entry)
-dnl  __(ref_global(imm1,ret1valn))
-dnl  __(cmpr(imm1,loc_pc))
-dnl  __(build_lisp_frame(fn,loc_pc,imm0))
-dnl  __(bne 1f)
-dnl  __(ref_global(imm0,lexpr_return))
-dnl  __(build_lisp_frame(rzero,imm0,vsp))
-dnl  __(mov loc_pc,imm1)
-dnl  __(ldr imm0,[rcontext,#tcr.cs_limit])
-dnl  __(trllt(sp,imm0))
-dnl  __(mov fn,#0)
-dnl  __(bx lr)
-dnl 
-dnl         /* The single-value case just needs to return to something that'll pop  */
-dnl         /* the variable-length frame off of the vstack.  */
-dnl 1:
-dnl  __(ref_global(loc_pc,lexpr_return1v))
-dnl  __(ldr imm0,[rcontext,#tcr.cs_limit])
-dnl  __(trllt(sp,imm0))
-dnl  __(mov fn,#0)
-dnl  __(bx lr)
-
+/* Divide the 64 bit unsigned integer in imm0 (low) and imm1 (high) by
+   the 32-bit unsigned integer in imm2; return the quotient in
+   imm0:imm1 and remainder in imm2.  We pretty much have to do this
+   as an ff call; even if we wrote the code ourselves, we'd have to
+   enter foreign context to use as many imm regs as we'd need.
+   Moral: don't do integer division on the ARM.
+*/
+        .globl C(__aeabi_uldivmod)        
+_spentry(udiv64by32)
+        __(cmp imm2,#0)
+        __(moveq arg_z,#XDIVZRO)
+        __(moveq nargs,#1<<fixnumshift)
+        __(beq _SPksignalerr)
+        __(stmdb vsp!,{arg_z,arg_y,arg_x,temp0,temp1,temp2})
+        __(str vsp,[rcontext,#tcr.save_vsp])
+        __(mov arg_z,rcontext)
+        __(ldr arg_y,[rcontext,#tcr.last_lisp_frame])
+        __(build_lisp_frame(r3))
+        __(str sp,[arg_z,#tcr.last_lisp_frame])
+        __(str allocptr,[arg_z,#tcr.save_allocptr])
+        __(mov r3,#TCR_STATE_FOREIGN)
+        __(str r3,[arg_z,#tcr.valence])
+        __(mov r3,#0)
+        __(bl C(__aeabi_uldivmod))
+        __(mov rcontext,arg_z)
+        __(str arg_y,[rcontext,#tcr.last_lisp_frame])
+        __(mov allocptr,#VOID_ALLOCPTR)
+        __(mov fn,#0)
+        __(mov temp2,#0)
+        __(mov temp1,#0)
+        __(mov temp0,#0)
+        __(mov arg_x,#TCR_STATE_LISP)
+        __(str arg_x,[rcontext,#tcr.valence])
+        __(ldr allocptr,[rcontext,#tcr.save_allocptr])
+        __(ldm vsp!,{arg_z,arg_y,arg_x,temp0,temp1,temp2})
+        __(ldr fn,[sp,#lisp_frame.savefn])
+        __(ldr lr,[sp,#lisp_frame.savelr])
+        __(discard_lisp_frame())
+        __(bx lr)
 
 
@@ -2398,8 +2412,8 @@
 _spentry(mvpasssym)
         __(cmp nargs,#node_size*nargregs)
-        __(mov nfn,vsp)
-        __(subgt nfn,nfn,#node_size*nargregs)
-        __(addgt nfn,nfn,nargs)
-        __(build_lisp_frame(imm0,nfn))
+        __(mov imm1,vsp)
+	__(subgt imm1,imm1,#node_size*nargregs)
+	__(addgt imm1,imm1,nargs)
+	__(build_lisp_frame(imm0,imm1))
         __(ref_global(lr,ret1val_addr,imm0))
         __(mov fn,#0)
@@ -2850,87 +2864,36 @@
         __(b local_label(error_exit))
 
-/* Most ARMs don't have hardware integer division.  This algorithm's
-  from Sloss, Symes, & Wright.  On entry: imm0 = numerator, imm1 = denominator;
-  on exit, imm0 = quotient, imm1 = remainder, imm2 clobbered.  Check for /0
-  here, so that callers don't have to.
-*/        
+        .globl C(__aeabi_uidivmod)                
 _spentry(udiv32)
-        __(cmp imm0,#0)
+        __(cmp imm1,#0)
         __(moveq arg_z,#XDIVZRO)
         __(moveq nargs,#1<<fixnumshift)
         __(beq _SPksignalerr)
-        __(ldr imm2,[rcontext,#tcr.flags])
-        __(orr imm2,imm2,#(1<<TCR_FLAG_BIT_ALLOCPTR_FOREIGN))
-        __(str imm2,[rcontext,#tcr.flags])
-        __(vpush1(rcontext))
-        /* Hopefully safe now to use r3 (rcontext) and r12 (allocptr)
-           as imm regs. */
-pushdef(`q',`r0')
-pushdef(`r',`r1')
-pushdef(`s',`r2')
-pushdef(`m',`r3')
-pushdef(`a',`r12')
-        __(clz s,q)
-        __(movs a,q,lsl s)
-        __(add a,pc,a,lsr #25)
-        __(ldrbeq a,[a,#local_label(t32)-local_label(b32)-64])
-local_label(b32):
-        __(subs s,s,#7)
-        __(rsb m,q,#0)
-        __(movpl q,a,lsl s)
-        __(mulpl a,q,m)
-        __(bmi local_label(udiv_by_large_d))
-        __(smlawt q,q,a,q)
-        __(teq m,m,asr #1)
-        __(mulne a,q,m)
-        __(movne s,#0)
-        __(smlalne s,q,a,q)
-        __(beq local_label(udiv_by_1))
-        __(umull s,q,r,q)
-        __(add r,r,m)
-        __(mla r,q,m,r)
-        __(cmn r,m)
-        __(subcs r,r,m)
-        __(addcc q,q,#1)
-        __(addpl r,r,m,lsl #1)
-        __(addpl q,q,#2)
-        __(b local_label(done))
-local_label(udiv_by_large_d):        
-        __(sub a,a,#4)
-        __(rsb s,s,#0)
-        __(mov q,a,lsr s)
-        __(umull s,q,r,q)
-        __(mla r,q,m,r)
-        __(cmn m,r,lsr #1)
-        __(addcs r,r,m,lsl #1)
-        __(addcs q,q,#2)
-        __(cmn m,r)
-        __(addcs q,q,#1)
-        __(b local_label(done))
-local_label(udiv_by_1):
-        __(mov q,r)
-        __(mov r,#0)
-local_label(done):   
-        __(mov allocptr,#-8)
-        __(vpop1(rcontext))
-        __(ldr imm2,[rcontext,tcr.flags])
-        __(bic imm2,imm2,#(1<<TCR_FLAG_BIT_ALLOCPTR_FOREIGN))
-        __(str imm2,[rcontext,tcr.flags])
-        __(ldr allocptr,[rcontext,tcr.save_allocptr])
-        __(bx lr)
-popdef(`s')
-popdef(`m')
-popdef(`a')
-popdef(`q')
-popdef(`r')
-local_label(t32):    
-        .byte 0xff,0xfc,0xf8,0xf4,0xf0,0xed,0xea,0xe6
-        .byte 0xe3,0xe0,0xdd,0xda,0xd7,0xd4,0xd2,0xcf
-        .byte 0xcc,0xca,0xc7,0xc5,0xc3,0xc0,0xbe,0xbc
-        .byte 0xba,0xb8,0xb6,0xb4,0xb2,0xb0,0xae,0xac
-        .byte 0xaa,0xa8,0xa7,0xa5,0xa3,0xa2,0xa0,0x9f
-        .byte 0x9d,0x9c,0x9a,0x99,0x97,0x96,0x94,0x93
-        .byte 0x92,0x90,0x8f,0x8e,0x8d,0x8c,0x8a,0x89
-        .byte 0x88,0x87,0x86,0x85,0x84,0x83,0x82,0x81
+        __(stmdb vsp!,{arg_z,arg_y,arg_x,temp0,temp1,temp2})
+        __(str vsp,[rcontext,#tcr.save_vsp])
+        __(mov arg_z,rcontext)
+        __(ldr arg_y,[rcontext,#tcr.last_lisp_frame])
+        __(build_lisp_frame(r3))
+        __(str sp,[arg_z,#tcr.last_lisp_frame])
+        __(str allocptr,[arg_z,#tcr.save_allocptr])
+        __(mov r3,#TCR_STATE_FOREIGN)
+        __(str r3,[arg_z,#tcr.valence])
+        __(mov r3,#0)
+        __(bl C(__aeabi_uidivmod))
+        __(mov rcontext,arg_z)
+        __(str arg_y,[rcontext,#tcr.last_lisp_frame])
+        __(mov allocptr,#VOID_ALLOCPTR)
+        __(mov fn,#0)
+        __(mov temp2,#0)
+        __(mov temp1,#0)
+        __(mov temp0,#0)
+        __(mov arg_x,#TCR_STATE_LISP)
+        __(str arg_x,[rcontext,#tcr.valence])
+        __(ldr allocptr,[rcontext,#tcr.save_allocptr])
+        __(ldm vsp!,{arg_z,arg_y,arg_x,temp0,temp1,temp2})
+        __(ldr fn,[sp,#lisp_frame.savefn])
+        __(ldr lr,[sp,#lisp_frame.savelr])
+        __(discard_lisp_frame())
+        __(bx lr)
 
 _spentry(sdiv32)
@@ -2957,4 +2920,5 @@
         __(ldr arg_y,[rcontext,#tcr.last_lisp_frame])
         __(stmdb vsp!,{arg_y,arg_x,temp0,temp1,temp2})
+        __(str vsp,[rcontext,#tcr.save_vsp])
 /* There's a u32 vector on top of the stack ; its first data word points
    to the previous stack object.  The 4 words at the bottom of the vector
