/* * xmm_power.c * xmmLibm * * Created by Ian Ollmann, Ph.D. on 7/15/05. * Copyright © 2005 Apple Computer, Inc. All rights reserved. * * Constants from original typing by Earl Killian of MIPS on March 23rd, 1992. * Converted from pairs of 32-bit hexadecimal words to C99 notation, by Ian Ollmann * * Algorithm from Peter Tang: * * ACM Transactions on Mathematical Software, Vol 15, 2, June 1989, pp 144-157 * ACM Transactions on Mathematical Software, Vol 18, 2, June 1992, pp 211-222 * */ #include "xmmLibm_prefix.h" //#include "math.h" static const double T1 = 0x1.F800000000000p-1; // 0.984375 // T1 = 1-1/64 static const double T2 = 0x1.0400000000000p+0; // 1.015625 // T2 = 1+1/64 static const double T52 = 0x1.0000000000000p+52; // 4503599627370496 // T52 = 2^52 static const double T53 = 0x1.0000000000000p+53; // 9007199254740992 // T53 = 2^53 static const double Emax = 0x1.62E42FEFA39EFp+9; //709.782712893383973096 static const double Emin = -0x1.74385446D71C3p+9; //-744.44007192138121809 static const double A1 = 0x1.5555555555555p-4; //0.0833333333333333287074 static const double A2 = 0x1.99999999FE736p-7; //0.0125000000007165872062 static const double A3 = 0x1.24919E6600596p-9; //0.00223212297987691443008 static const double B1 = 0x1.555555554F9EDp-4; //0.0833333333330085884727 static const double B2 = 0x1.999A118F10BD9p-7; //0.0125000558601921375051 static const double C1 = 0x1.5555555555555p-4; //0.0833333333333333287074 static const double C2 = 0x1.99999999AF7C8p-7; //0.0125000000001555117146 static const double C3 = 0x1.24923B2F1315Cp-9; //0.00223214123210461849733 static const double C4 = 0x1.CE20EE795DBA1p-12; //0.00044072021372392784647 static const double C5 = 0x1.CE20EE795DBA1p-12; //0.00044072021372392784647 static const double Inv_L = 0x1.71547652B82FEp+5; //46.1662413084468283841 static const double L1 = 0x1.62E42FEF00000p-6; //0.0216608493901730980724 static const double L2 = 0x1.473DE6AF278EDp-39; //2.32519284687887401481e-12 static const double P1 = 0x1.0000000000000p-1; // 0.5 static const double P2 = 0x1.5555555547334p-3; //0.166666666665059914898 static const double P3 = 0x1.555555554A9D2p-5; //0.0416666666663619983391 static const double P4 = 0x1.1111609FBE568p-7; //0.00833337038010269204324 static const double P5 = 0x1.6C172098CB78Bp-10; //0.0013888944287816253325 static const double plusinf = 1e500; // inf static const double logtable[129*2] = { 0.0, // 0 0.0, // 0 0x1.FE02A6B200000p-8, //0.00778214044294145423919 -0x1.F30EE07912DF9p-41, //-8.86505291726724739329e-13 0x1.FC0A8B1000000p-7, //0.015504186536418274045 -0x1.FE0E183092C59p-42, //-4.5301989413649348858e-13 0x1.7B91B07D80000p-6, //0.0231670592820591991767 -0x1.2772AB6C0559Cp-41, //-5.24820947929564425056e-13 0x1.F829B0E780000p-6, //0.0307716586667083902285 0x1.980267C7E09E4p-45, //4.52981425779092882775e-14 0x1.39E87BA000000p-5, //0.0383188643027096986771 -0x1.42A056FEA4DFDp-41, //-5.7309948330766312415e-13 0x1.77458F6340000p-5, //0.0458095360318111488596 -0x1.2303B9CB0D5E1p-41, //-5.16945692881222029089e-13 0x1.B42DD71180000p-5, //0.053244514518155483529 0x1.71BEC28D14C7Ep-41, //6.56799336898521766515e-13 0x1.F0A30C0100000p-5, //0.0606246218158048577607 0x1.62A6617CC9717p-41, //6.29984819938331143924e-13 0x1.16536EEA40000p-4, //0.0679506619089806918055 -0x1.0A3E2F3B47D18p-41, //-4.72942410916632905482e-13 0x1.341D7961C0000p-4, //0.0752234212377516087145 -0x1.717B6B33E44F8p-43, //-1.64083015855986619259e-13 0x1.51B073F060000p-4, //0.0824436692109884461388 0x1.83F69278E686Ap-44, //8.61451293608781447223e-14 0x1.6F0D28AE60000p-4, //0.089612158690215437673 -0x1.2968C836CC8C2p-41, //-5.28305053080814367877e-13 0x1.8C345D6320000p-4, //0.0967296264589094789699 -0x1.937C294D2F567p-42, //-3.58366674300941370128e-13 0x1.A926D3A4A0000p-4, //0.103796793680885457434 0x1.AAC6CA17A4554p-41, //7.58107392301637643358e-13 0x1.C5E548F5C0000p-4, //0.110814366340491687879 -0x1.C5E7514F4083Fp-43, //-2.01573684160162150917e-13 0x1.E27076E2A0000p-4, //0.117783035655520507134 0x1.E5CBD3D50FFFCp-41, //8.62947404296943765415e-13 0x1.FEC9131DC0000p-4, //0.12470347850103280507 -0x1.54555D1AE6607p-44, //-7.55692068745133691756e-14 0x1.0D77E7CD10000p-3, //0.131576357789526809938 -0x1.C69A65A23A170p-41, //-8.075373495358435282e-13 0x1.1B72AD52F0000p-3, //0.138402322858382831328 0x1.9E80A41811A39p-41, //7.36304357708705134617e-13 0x1.29552F8200000p-3, //0.145182009844575077295 -0x1.5B967F4471DFCp-44, //-7.71800133682809851086e-14 0x1.371FC201F0000p-3, //0.151916042026641662233 -0x1.C22F10C9A4EA8p-41, //-7.99687160774375808217e-13 0x1.44D2B6CCB0000p-3, //0.158605030175749561749 0x1.F4799F4F6543Ep-41, //8.890223439724662699e-13 0x1.526E5E3A20000p-3, //0.165249572895845631137 -0x1.2F21746FF8A47p-41, //-5.38468261878823239516e-13 0x1.5FF3070A80000p-3, //0.171850256927427835763 -0x1.B0B0DE3077D7Ep-41, //-7.6861342240181686233e-13 0x1.6D60FE71A0000p-3, //0.178407657473144354299 -0x1.6F1B955C4D1DAp-42, //-3.26057179310581566492e-13 0x1.7AB8902110000p-3, //0.184922338494288851507 -0x1.37B720E4A694Bp-42, //-2.76858843104483055456e-13 0x1.87FA065210000p-3, //0.191394853000019793399 -0x1.B77B7EFFB7F41p-42, //-3.90338789379495198219e-13 0x1.9525A9CF40000p-3, //0.197825743329303804785 0x1.5AD1D904C1D4Ep-41, //6.16075577558872326221e-13 0x1.A23BC1FE30000p-3, //0.20421554142922104802 -0x1.2A739B23B93E1p-41, //-5.30156516006025969342e-13 0x1.AF3C94E810000p-3, //0.210564769107804750092 -0x1.00349CC67F9B2p-41, //-4.55112422774782019696e-13 0x1.BC286742E0000p-3, //0.216873938301432644948 -0x1.CCA75818C5DBCp-41, //-8.18285329273778346373e-13 0x1.C8FF7C79B0000p-3, //0.223143551314933574758 -0x1.97794F689F843p-41, //-7.23818992174968106057e-13 0x1.D5C216B500000p-3, //0.229374101065332069993 -0x1.11BA91BBCA682p-41, //-4.86240001538378988236e-13 0x1.E27076E2B0000p-3, //0.235566071312860003673 -0x1.A342C2AF0003Cp-44, //-9.30945949519688945136e-14 0x1.EF0ADCBDC0000p-3, //0.241719936886511277407 0x1.64D948637950Ep-41, //6.33890736899755317832e-13 0x1.FB9186D5E0000p-3, //0.247836163904139539227 0x1.F1546AAA3361Cp-42, //4.41717553713155466566e-13 0x1.0402594B50000p-2, //0.253915209981641964987 -0x1.7DF928EC217A5p-41, //-6.78520849597058843023e-13 0x1.0A324E2738000p-2, //0.259957524436686071567 0x1.0E35F73F7A018p-42, //2.39995404842117353465e-13 0x1.1058BF9AE8000p-2, //0.265963548497893498279 -0x1.A9573B02FAA5Ap-41, //-7.55556940028374187478e-13 0x1.1675CABAB8000p-2, //0.271933715483100968413 0x1.30701CE63EAB9p-41, //5.40790418614551497411e-13 0x1.1C898C1698000p-2, //0.277868451003087102436 0x1.9FAFBC68E7540p-42, //3.69203750820800887027e-13 0x1.22941FBCF8000p-2, //0.283768173130738432519 -0x1.A6976F5EB0963p-44, //-9.38341722366369999987e-14 0x1.2895A13DE8000p-2, //0.289633292582948342897 0x1.A8D7AD24C13F0p-44, //9.43339818951269030846e-14 0x1.2E8E2BAE10000p-2, //0.2954642128934210632 0x1.D309C2CC91A85p-42, //4.1481318704258567983e-13 0x1.347DD9A988000p-2, //0.301261330578199704178 -0x1.5594DD4C58092p-45, //-3.7923164802093146798e-14 0x1.3A64C55698000p-2, //0.307025035295737325214 -0x1.D0B1C68651946p-41, //-8.25463138725003992339e-13 0x1.4043086868000p-2, //0.312755710003330023028 0x1.3F1DE86093EFAp-41, //5.66865358290073900922e-13 0x1.4618BC21C8000p-2, //0.318453731119006988592 -0x1.09EC17A426426p-41, //-4.72372782198636743149e-13 0x1.4BE5F95778000p-2, //0.324119468654316733591 -0x1.D7C92CD9AD824p-44, //-1.04757500587765412913e-13 0x1.51AAD872E0000p-2, //0.329753286372579168528 -0x1.F4BD8DB0A7CC1p-44, //-1.11186713895593226425e-13 0x1.5767717458000p-2, //0.335355541921671829186 -0x1.2C9D5B2A49AF9p-41, //-5.33998929200329680741e-13 0x1.5D1BDBF580000p-2, //0.340926586970454081893 0x1.394A11B1C1EE4p-43, //1.39128412121975659358e-13 0x1.62C82F2BA0000p-2, //0.34646676734701031819 -0x1.C356848506EADp-41, //-8.01737271397201833369e-13 0x1.686C81E9B0000p-2, //0.351976423156884266064 0x1.4AEC442BE1015p-42, //2.9391859187648000773e-13 0x1.6E08EAA2B8000p-2, //0.357455888921322184615 0x1.0F1C609C98C6Cp-41, //4.81589611172320539489e-13 0x1.739D7F6BC0000p-2, //0.362905493690050207078 -0x1.7FCB18ED9D603p-41, //-6.81753940632532726416e-13 0x1.792A55FDD8000p-2, //0.368325561159508652054 -0x1.C2EC1F512DC03p-41, //-8.00999005543249141653e-13 0x1.7EAF83B828000p-2, //0.373716409792905324139 0x1.7E1B259D2F3DAp-41, //6.78756682315870616582e-13 0x1.842D1DA1E8000p-2, //0.379078352934811846353 0x1.62E927628CBC2p-43, //1.57612037739694350287e-13 0x1.89A3386C18000p-2, //0.384411698911208077334 -0x1.ED2A52C73BF78p-41, //-8.76037599077487426799e-13 0x1.8F11E87368000p-2, //0.389716751140440464951 -0x1.D3881E8962A96p-42, //-4.15251580634361213004e-13 0x1.947941C210000p-2, //0.394993808240542421117 0x1.6FABA4CDD147Dp-42, //3.26556988969071456956e-13 0x1.99D9581180000p-2, //0.400243164127459749579 -0x1.F753456D113B8p-42, //-4.47042650104524445082e-13 0x1.9F323ECBF8000p-2, //0.405465108107819105498 0x1.84BF2B68D766Fp-42, //3.45276479520397708744e-13 0x1.A484090E58000p-2, //0.410659924984429380856 0x1.D8515FE535B87p-41, //8.39005077851830734139e-13 0x1.A9CEC9A9A0000p-2, //0.415827895143593195826 0x1.0931A909FEA5Ep-43, //1.17769787513692141889e-13 0x1.AF12932478000p-2, //0.420969294644237379543 -0x1.E53BB31EED7A9p-44, //-1.07743414616095792458e-13 0x1.B44F77BCC8000p-2, //0.426084395310681429692 0x1.EC5197DDB55D3p-43, //2.1863343293215910319e-13 0x1.B985896930000p-2, //0.431173464818130014464 0x1.0FB598FB14F89p-42, //2.41326394913331314894e-13 0x1.BEB4D9DA70000p-2, //0.436236766774527495727 0x1.B7BF7861D37ACp-42, //3.90574622098307022265e-13 0x1.C3DD7A7CD8000p-2, //0.44127456080423144158 0x1.6A6B9D9E0A5BDp-41, //6.43787909737320689684e-13 0x1.C8FF7C79A8000p-2, //0.446287102628048160113 0x1.A21AC25D81EF3p-42, //3.71351419195920213229e-13 0x1.CE1AF0B860000p-2, //0.451274644139630254358 -0x1.8290905A86AA6p-43, //-1.71669213360824320344e-13 0x1.D32FE7E010000p-2, //0.456237433481874177232 -0x1.42A9E21373414p-42, //-2.86582851579143533167e-13 0x1.D83E7258A0000p-2, //0.461175715121498797089 0x1.79F2828ADD176p-41, //6.71369279138460114513e-13 0x1.DD46A04C20000p-2, //0.466089729925442952663 -0x1.DAFA08CECADB1p-41, //-8.43728104087127572675e-13 0x1.E24881A7C8000p-2, //0.470979715219073113985 -0x1.3D9E34270BA6Bp-42, //-2.82101438461812689978e-13 0x1.E744261D68000p-2, //0.475845904869856894948 0x1.E1F8DF68DBCF3p-44, //1.07019317621142549209e-13 0x1.EC399D2468000p-2, //0.480688529345570714213 0x1.9802EB9DCA7E7p-43, //1.81193463664411114729e-13 0x1.F128F5FAF0000p-2, //0.485507815781602403149 0x1.BB2CD720EC44Cp-44, //9.84046527823262695501e-14 0x1.F6123FA700000p-2, //0.490303988044615834951 0x1.45630A2B61E5Bp-41, //5.78003198945402769376e-13 0x1.FAF588F790000p-2, //0.495077266798034543172 -0x1.9C24CA098362Bp-43, //-1.83028573560416684376e-13 0x1.FFD2E08580000p-2, //0.499827869556611403823 -0x1.6CF54D05F9367p-43, //-1.62074001567449500378e-13 0x1.02552A5A5C000p-1, //0.504556010751912253909 0x1.0FEC69C695D7Fp-41, //4.830331494955320223e-13 0x1.04BDF9DA94000p-1, //0.509261901790523552336 -0x1.92D9A033EFF75p-41, //-7.15605531723821174215e-13 0x1.0723E5C1CC000p-1, //0.513945751101346104406 0x1.F404E57963891p-41, //8.8821239518571854472e-13 0x1.0986F4F574000p-1, //0.518607764208354637958 -0x1.5BE8DC04AD601p-42, //-3.09005805132382426227e-13 0x1.0BE72E4254000p-1, //0.523248143765158602037 -0x1.57D49676844CCp-41, //-6.10765519728514951184e-13 0x1.0E44985D1C000p-1, //0.527867089620485785417 0x1.917EDD5CBBD2Dp-42, //3.56599696633478298092e-13 0x1.109F39E2D4000p-1, //0.532464798869114019908 0x1.92DFBC7D93617p-42, //3.57823965912763837621e-13 0x1.12F7195940000p-1, //0.537041465897345915437 -0x1.043ACFEDCE638p-41, //-4.62260870015445768969e-13 0x1.154C3D2F4C000p-1, //0.541597282432121573947 0x1.5E9A98F33A396p-41, //6.22797629172251525649e-13 0x1.179EABBD88000p-1, //0.54613243759740726091 0x1.9A0BFC60E6FA0p-41, //7.28389472720657362987e-13 0x1.19EE6B467C000p-1, //0.550647117952394182794 0x1.2DD98B97BAEF0p-42, //2.68096466152116723636e-13 0x1.1C3B81F714000p-1, //0.555141507540611200966 -0x1.EDA1B58389902p-44, //-1.09608250460592783688e-13 0x1.1E85F5E704000p-1, //0.559615787935399566777 0x1.A07BD8B34BE7Cp-46, //2.3119493838005377632e-14 0x1.20CDCD192C000p-1, //0.564070138285387656651 -0x1.4926CAFC2F08Ap-41, //-5.84690580052992446699e-13 0x1.23130D7BEC000p-1, //0.568504735352689749561 -0x1.7AFA4392F1BA7p-46, //-2.10374825114449422873e-14 0x1.2555BCE990000p-1, //0.572919753562018740922 -0x1.06987F78A4A5Ep-42, //-2.33231829455874083248e-13 0x1.2795E1289C000p-1, //0.577315365035246941261 -0x1.DCA290F81848Dp-42, //-4.23336942881419153586e-13 0x1.29D37FEC2C000p-1, //0.5816917396350618219 -0x1.EEA6F465268B4p-42, //-4.39339379697378426136e-13 0x1.2C0E9ED448000p-1, //0.586049045003164792433 0x1.D1772F5386374p-42, //4.13416470738355643357e-13 0x1.2E47436E40000p-1, //0.590387446602107957006 0x1.34202A10C3491p-44, //6.84176364159146659095e-14 0x1.307D7334F0000p-1, //0.594707107746216934174 0x1.0BE1FB590A1F5p-41, //4.75855340044306376333e-13 0x1.32B1339120000p-1, //0.599008189645246602595 0x1.D71320556B67Bp-41, //8.36796786747576938145e-13 0x1.34E289D9D0000p-1, //0.603290851438941899687 -0x1.E2CE9146D277Ap-41, //-8.57637346466586390378e-13 0x1.37117B5474000p-1, //0.607555250224322662689 0x1.ED71774092113p-43, //2.19132812293400917731e-13 0x1.393E0D3564000p-1, //0.611801541106615331955 -0x1.5E6563BBD9FC9p-41, //-6.22428425364311469584e-13 0x1.3B6844A000000p-1, //0.616029877215623855591 -0x1.EEA838909F3D3p-44, //-1.0983594325438429833e-13 0x1.3D9026A714000p-1, //0.620240409751204424538 0x1.6FAA404263D0Bp-41, //6.53104313776336534177e-13 0x1.3FB5B84D18000p-1, //0.624433288012369303033 -0x1.0BDA4B162AFA3p-41, //-4.75801990217107657695e-13 0x1.41D8FE8468000p-1, //0.628608659422752680257 -0x1.AA33736867A17p-42, //-3.78542512654570397313e-13 0x1.43F9FE2F9C000p-1, //0.632766669570628437214 0x1.CCEF4E4F736C2p-42, //4.09392332186786656392e-13 0x1.4618BC21C4000p-1, //0.636907462236194987781 0x1.EC27D0B7B37B3p-41, //8.74243839148582888557e-13 0x1.48353D1EA8000p-1, //0.641031179420679109171 0x1.1BEE7ABD17660p-42, //2.52181884568428814231e-13 0x1.4A4F85DB04000p-1, //0.645137961373620782979 -0x1.44FDD840B8591p-45, //-3.60813136042255739798e-14 0x1.4C679AFCD0000p-1, //0.64922794662561500445 -0x1.1C64E971322CEp-41, //-5.05185559242808977623e-13 0x1.4E7D811B74000p-1, //0.653301272011958644725 0x1.BB09CB0985646p-41, //7.86994033233553225797e-13 0x1.50913CC018000p-1, //0.657358072709030238912 -0x1.794B434C5A4F5p-41, //-6.70208769619490643267e-13 0x1.52A2D265BC000p-1, //0.661398482245203922503 0x1.6ABB9DF22BC57p-43, //1.61085757539324585156e-13 0x1.54B2467998000p-1, //0.665422632544505177066 0x1.497A915428B44p-41, //5.85271884362515112697e-13 0x1.56BF9D5B40000p-1, //0.669430653942981734872 -0x1.8CD7DC73BD194p-42, //-3.52467572979047943315e-13 0x1.58CADB5CD8000p-1, //0.673422675212350441143 -0x1.9DB3DB43689B4p-43, //-1.83720844956290584735e-13 0x1.5AD404C358000p-1, //0.677398823590920073912 0x1.F2CFB29AAA5F0p-41, //8.86066898134949155668e-13 0x1.5CDB1DC6C0000p-1, //0.681359224807238206267 0x1.7648CF6E3C5D7p-41, //6.64862680714687006264e-13 0x1.5EE02A9240000p-1, //0.685304003098281100392 0x1.67570D6095FD2p-41, //6.38316151706465171657e-13 0x1.60E32F4478000p-1, //0.689233281238557538018 0x1.1B194F912B417p-42, //2.51442307283760746611e-13 0x1.62E42FEFA4000p-1, //0.693147180560117703862 -0x1.8432A1B0E2634p-43 //-1.72394445256148347926e-13 }; static const double exptable[32*2] = { 0x1.0000000000000p+0, // 1 0.0, // 0 0x1.059B0D3158540p+0, //1.02189714865410508082 0x1.A1D73E2A475B4p-47, //1.15974117063913618369e-14 0x1.0B5586CF98900p+0, //1.04427378242741042413 0x1.EC5317256E308p-49, //3.41618797093084913461e-15 0x1.11301D0125B40p+0, //1.06714040067681992241 0x1.0A4EBBF1AED93p-48, //3.69575974405711634226e-15 0x1.172B83C7D5140p+0, //1.09050773266524458904 0x1.D6E6FBE462876p-47, //1.30701638697787231928e-14 0x1.1D4873168B980p+0, //1.11438674259588310633 0x1.53C02DC0144C8p-47, //9.42997619141976990955e-15 0x1.2387A6E756200p+0, //1.13878863475667913008 0x1.C3360FD6D8E0Bp-47, //1.25236260025620074908e-14 0x1.29E9DF51FDEC0p+0, //1.16372485877757014805 0x1.09612E8AFAD12p-47, //7.36576401089527357793e-15 0x1.306FE0A31B700p+0, //1.18920711500271636396 0x1.52DE8D5A46306p-48, //4.70275685574031410345e-15 0x1.371A7373AA9C0p+0, //1.21524735998046651275 0x1.54E28AA05E8A9p-49, //2.36536434724852953227e-15 0x1.3DEA64C123400p+0, //1.2418578120734764525 0x1.11ADA0911F09Fp-47, //7.59609684336943426188e-15 0x1.44E0860618900p+0, //1.26905095719172322788 0x1.68189B7A04EF8p-47, //9.99467515375775096979e-15 0x1.4BFDAD5362A00p+0, //1.29683955465100098081 0x1.38EA1CBD7F621p-47, //8.68512209487110863385e-15 0x1.5342B569D4F80p+0, //1.32523664315974087913 0x1.DF0A83C49D86Ap-52, //4.15501897749673983948e-16 0x1.5AB07DD485400p+0, //1.35425554693688354746 0x1.4AC64980A8C8Fp-47, //9.18083828572431297142e-15 0x1.6247EB03A5580p+0, //1.38390988196383091235 0x1.2C7C3E81BF4B7p-50, //1.04251790803720876383e-15 0x1.6A09E667F3BC0p+0, //1.41421356237309225889 0x1.921165F626CDDp-49, //2.78990693089087774733e-15 0x1.71F75E8EC5F40p+0, //1.44518080697703510396 0x1.9EE91B8797785p-47, //1.15160818747516875124e-14 0x1.7A11473EB0180p+0, //1.47682614593949779191 0x1.B5F54408FDB37p-50, //1.51947228890629129108e-15 0x1.82589994CCE00p+0, //1.50916442759341862256 0x1.28ACF88AFAB35p-48, //4.11720196080016564552e-15 0x1.8ACE5422AA0C0p+0, //1.54221082540793474891 0x1.B5BA7C55A192Dp-48, //6.074702681072821835e-15 0x1.93737B0CDC5C0p+0, //1.57598084510787828094 0x1.27A280E1F92A0p-47, //8.20551346575487959595e-15 0x1.9C49182A3F080p+0, //1.61049033194925073076 0x1.01C7C46B071F3p-48, //3.57742087137029902059e-15 0x1.A5503B23E2540p+0, //1.64575547815395850648 0x1.C8B424491CAF8p-48, //6.33803674368915982631e-15 0x1.AE89F995AD380p+0, //1.68179283050741901206 0x1.6AF439A68BB99p-47, //1.00739973218322238167e-14 0x1.B7F76F2FB5E40p+0, //1.71861929812247637983 0x1.BAA9EC206AD4Fp-50, //1.5357984302925880313e-15 0x1.C199BDD855280p+0, //1.75625216037329323626 0x1.C2220CB12A092p-48, //6.24685034485536557515e-15 0x1.CB720DCEF9040p+0, //1.79470907500309806437 0x1.48A81E5E8F4A5p-47, //9.12205626035419583226e-15 0x1.D5818DCFBA480p+0, //1.83400808640934087634 0x1.C976816BAD9B8p-50, //1.58714330671767538549e-15 0x1.DFC97337B9B40p+0, //1.87416763411029307917 0x1.EB968CAC39ED3p-48, //6.82215511854592947014e-15 0x1.EA4AFA2A490C0p+0, //1.91520656139714162691 0x1.9858F73A18F5Ep-48, //5.66696026748885461802e-15 0x1.F50765B6E4540p+0, //1.95714412417540017941 0x1.9D3E12DD8A18Bp-54 //8.9607677910366677676e-17 }; static const double one = 1.0; static const double mOne = -1.0; static const double two = 2.0; static const xSInt64 bias[2] = { {2045, 0}, {1023, 0} }; static const xUInt64 POWER_NAN = { 0x7FF8000000000025ULL, 0 }; //should be the same as nan("37") static inline xDouble ipow( xDouble x, uint32_t u ) ALWAYS_INLINE; static inline xDouble _pow( xDouble X, xDouble Y ) ALWAYS_INLINE; static inline xDouble xscalb( xDouble x, int M ) { static const double scale[2] = { 0x1.0p1022, 0x1.0p-1022 }; static const int step[2] = { 1022, -1022 }; int index = M >> 31; int count = abs(M); xSInt64 xm = _mm_cvtsi32_si128( M + 1023 ); xSInt64 xstep = _mm_cvtsi32_si128( step[-index] ); xDouble xscale = _mm_load_sd( scale - index ); if( count > 1022 ) { x = _mm_mul_sd( x, xscale ); xm = _mm_sub_epi64( xm, xstep ); count -= 1022; if( count > 1022 ) { x = _mm_mul_sd( x, xscale ); xm = _mm_sub_epi64( xm, xstep ); count -= 1022; if( count > 1022 ) return _mm_mul_sd( x, xscale ); } } xm = _mm_slli_epi64( xm, 52 ); return _mm_mul_sd( x, (xDouble) xm ); } //The original source from vMathLib used recursion (yuck!) //That was removed in favor of this more efficient loop static inline xDouble ipow( xDouble x, uint32_t u ) { xDouble result; uint32_t mask = 1; if( 0 == u ) return _mm_load_sd( &one ); //skip trailing zeros while( u && 0 == (u & mask) ) { //square x x = _mm_mul_sd( x,x ); //move up to the next mask mask += mask; } //this prevents an underflow from occurring for pow( tiny, 1 ) result = x; //clear the bit in u (so we don't loop forever) u &= ~mask; while( u ) { //square x x = _mm_mul_sd( x,x ); //move up to the next mask mask += mask; //if the current mask bit is set in u, multiply the result by x if( u & mask ) { result = _mm_mul_sd( result, x ); //clear the bit in u (so we don't loop forever) u &= ~mask; } } return result; } static inline xDouble _pow( xDouble X, xDouble Y ) { //get old fp env, and set the default one int oldMXCSR = _mm_getcsr(); int newMXCSR = (oldMXCSR | DEFAULT_MXCSR) & DEFAULT_MASK; //set standard masks, enable denormals, set round to nearest if( newMXCSR != oldMXCSR ) _mm_setcsr( newMXCSR ); static const double tenM20 = 1e-20; static const double ten20 = 1e20; static const double d16 = 16.0; static const double d128 = 128.0; static const double r128 = 1.0/128.0; static const double r8 = 1.0/8.0; static const double smallestNormal = 0x1.0p-1022; static const xUInt64 iOne = { 1, 0 }; //if y == 0 or x == 1, then the answer is always 1, even if the other argument is NaN if( _mm_istrue_sd( _mm_or_pd( _mm_cmpeq_sdm( X, &one ), _mm_cmpeq_sdm( Y, (double*) &minusZeroD ) )) ) { X = _mm_load_sd( &one ); if( newMXCSR != oldMXCSR ) _mm_setcsr( oldMXCSR ); return X; } //get NaN inputs out of the way if( _mm_istrue_sd( _mm_cmpunord_sd( X, Y ) ) ) { if( newMXCSR != oldMXCSR ) _mm_setcsr( oldMXCSR ); return _mm_add_sd( X, Y ); } xDouble fabsY = _mm_andnot_pd( minusZeroD, Y ); xDouble fabsX = _mm_andnot_pd( minusZeroD, X ); xDouble fabsyIsInf = _mm_cmpeq_sdm( fabsY, &plusinf ); xDouble signPatch = _mm_setzero_pd(); //detect if Y is integer, even or odd xDouble floorbias = _mm_load_sd( &T52 ); floorbias = _mm_and_pd( floorbias, _mm_cmplt_sd( fabsY, floorbias ) ); xDouble yShifted = _mm_add_sd( fabsY, floorbias); xDouble roundFabsY = _mm_sub_sd( yShifted, floorbias ); xDouble yIsEven = (xDouble) _mm_sub_epi64( _mm_and_si128( (xUInt64) yShifted, iOne ), iOne ); xDouble yIsInt = _mm_cmpeq_sd( roundFabsY, fabsY); int intY = _mm_cvtsd_si32( Y ); _mm_setcsr( newMXCSR ); //Avoid setting the inexact flag yIsEven = _mm_or_pd( yIsEven, _mm_cmpge_sdm( fabsY, &T53 ) ); //|y| >= 2**53 is always even yIsInt = _mm_andnot_pd( fabsyIsInf, yIsInt ); //Patch up the case of negative X with large integer Y. We have to do it here, //because recursion is not inlinable. xDouble fabsxLTInf = _mm_cmplt_sdm( fabsX, &plusinf ); if( _mm_istrue_sd( _mm_and_pd( _mm_cmplt_sdm( X, (double*) &minusZeroD ), fabsxLTInf ))) //X < 0 { if( _mm_istrue_sd( yIsInt ) ) { X = fabsX; //X = fabs(X) signPatch = _mm_andnot_pd( yIsEven, minusZeroD ); //flip the sign before exiting if y is an odd int //if x == -1, then the answer is always 1 or -1, unless NaN (removed above) if( _mm_istrue_sd( _mm_cmpeq_sdm( X, &one )) ) { X = _mm_load_sd( &one ); goto exit; } } else { if( _mm_istrue_sd( _mm_cmplt_sdm( fabsY, &plusinf ) ) ) { X = (xDouble) POWER_NAN; oldMXCSR |= INVALID_FLAG; goto exit; } } } xDouble xGTzero = _mm_cmpgt_sdm( X, (double*) &minusZeroD ); xDouble yLTzero = _mm_cmplt_sdm( Y, (double*) &minusZeroD ); xDouble xEQzero = _mm_cmpeq_sdm( X, (double*) &minusZeroD ); //More accurate, fast case for Y is a integer if( _mm_istrue_sd( _mm_andnot_pd( xEQzero, yIsInt)) && intY != 0x80000000 ) { if( intY < 0 ) { _mm_setcsr( DEFAULT_MXCSR ); xDouble recip = _mm_div_sd( _mm_load_sd( &one ), X ); //if the reciprocal is exact, use ipow if( 0 == (_mm_getcsr() & INEXACT_FLAG ) ) { X = ipow( recip, -intY ); oldMXCSR |= _mm_getcsr() & ALL_FLAGS; _mm_setcsr( oldMXCSR ); X = _mm_xor_pd( X, signPatch ); return X; } _mm_setcsr( newMXCSR ); } else { X = ipow( X, intY ); oldMXCSR |= _mm_getcsr() & ALL_FLAGS; _mm_setcsr( oldMXCSR ); X = _mm_xor_pd( X, signPatch ); return X; } } xDouble fabsyEQhalf = _mm_cmpeq_sdm( fabsY, &P1 ); //sqrt, reciprocal sqrt if( _mm_istrue_sd( _mm_and_pd( fabsyEQhalf, xGTzero ) ) ) { _mm_setcsr( oldMXCSR ); X = _MM_SQRT_SD( X ); if( _mm_istrue_sd( yLTzero ) ) X = _mm_div_sd( _mm_load_sd( &one ), X ); return X; } xDouble xLTinf = _mm_cmplt_sdm( X, &plusinf ); xDouble fabsyGT1em20 = _mm_cmpgt_sdm( fabsY, &tenM20 ); xDouble fabsyLT1e20 = _mm_cmplt_sdm( fabsY, &ten20 ); if( _mm_istrue_sd( _mm_and_pd( _mm_and_pd( xGTzero, xLTinf ), //x > 0 && x < inf _mm_and_pd( fabsyGT1em20, fabsyLT1e20 ) ) ) ) //|y| > 1e-20 && |y| < 1e20 { xDouble z1, z2, w1, w2; xDouble t1LTx = _mm_cmpgt_sdm( X, &T1 ); xDouble xLTt2 = _mm_cmplt_sdm( X, &T2 ); if( _mm_istrue_sd( _mm_and_pd( t1LTx, xLTt2 ) ) ) { /* |X - 1| < 2^-6 */ /* procedure Lsmall */ xDouble xEQone = _mm_cmpeq_sdm( X, &one ); if( _mm_istrue_sd( xEQone ) ) { X = _mm_load_sd( &one ); goto exit; } xDouble f = _mm_sub_sdm( X, &one ); xDouble f1 = _mm_cvtss_sd( f, _mm_cvtsd_ss( (xFloat)f, f )); xDouble f2 = _mm_sub_sd( f, f1 ); xDouble g = _mm_add_sdm( f, &two ); g = _mm_div_sd( _mm_load_sd( &one ), g ); xDouble u = _mm_mul_sd( f, g ); u = _mm_add_sd( u, u ); xDouble v = _mm_mul_sd( u, u ); xDouble q = _mm_add_sdm( _mm_mul_sdm( v, &C5 ), &C4 ); q = _mm_add_sdm( _mm_mul_sd( q, v ), &C3 ); q = _mm_add_sdm( _mm_mul_sd( q, v ), &C2 ); q = _mm_add_sdm( _mm_mul_sd( q, v ), &C1 ); v = _mm_mul_sd( v, u ); q = _mm_mul_sd( q, v ); xDouble u1 = _mm_cvtss_sd( u, _mm_cvtsd_ss( (xFloat)u, u )); xDouble u2 = _mm_sub_sd( f, u1 ); u2 = _mm_add_sd( u2, u2 ); u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f1 ) ); u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f2 ) ); u2 = _mm_mul_sd( u2, g ); z2 = _mm_add_sd( q, u2 ); z1 = _mm_add_sd( u1, z2 ); z1 = _mm_cvtss_sd( z1, _mm_cvtsd_ss( (xFloat)z1, z1 )); z2 = _mm_add_sd( z2, _mm_sub_sd( u1, z1 )); } else { // x = mantissa(X, &m); // m = logb(X); x = scalb(X, -m); //convert denormals to normals xDouble xOne = _mm_load_sd( &one ); xDouble isDenormal = _mm_cmplt_sdm( fabsX, &smallestNormal ); xDouble denormBias = _mm_and_pd( xOne, isDenormal ); int isdenorm = _mm_cvtsi128_si32( (xSInt64) isDenormal ); X = _mm_or_pd( X, denormBias ); X = _mm_sub_sd( X, denormBias ); //calculate mantissa and exponent xDouble inf = _mm_load_sd( &plusinf ); xDouble exponent = _mm_and_pd( X, inf); xDouble x = _mm_andnot_pd( inf, X ); xSInt64 xi = _mm_srli_epi64( (xUInt64) exponent, 52 ); x = _mm_or_pd( x, xOne ); xi = _mm_sub_epi32( xi, bias[1 + isdenorm] ); exponent = _mm_cvtepi32_pd( (xSInt32) xi ); int j = _mm_cvttsd_si32( _mm_add_sdm( _mm_mul_sdm( x, &d128) , &P1 )); xDouble F = _mm_mul_sdm( _mm_cvtsi32_sd( X, j ), &r128 ); xDouble g = _mm_div_sd( xOne, _mm_add_sd( F, x ) ); xDouble f = _mm_sub_sd( x, F ); xDouble a1 = _mm_add_sdm( _mm_mul_sdm( exponent, &logtable[128*2]), &logtable[(j-128)*2] ); xDouble a2 = _mm_add_sdm( _mm_mul_sdm( exponent, &logtable[128*2]+1), &logtable[(j-128)*2+1] ); xDouble u = _mm_mul_sd( f, g ); u = _mm_add_sd( u, u ); xDouble u1 = _mm_cvtss_sd( u, _mm_cvtsd_ss( (xFloat)u, u ) ); xDouble v = _mm_mul_sd( u, u ); xDouble c = _mm_mul_sd( Y, a1 ); c = _mm_andnot_pd( minusZeroD, c ); xDouble cLTsixteen = _mm_cmplt_sdm( c, &d16 ); xDouble cLToneeighth = _mm_cmplt_sdm( c, &r8 ); xDouble q, u2; if( _mm_istrue_sd( cLToneeighth ) ) { //u2 = g * (2 * (f - u1 * F) - u1 * f); u2 = _mm_sub_sd( f, _mm_mul_sd( u1, F ) ); u2 = _mm_add_sd( u2, u2 ); u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f ) ); u2 = _mm_mul_sd( u2, g ); //q = u * (v * (B1 + v * B2)); q = _mm_add_sdm( _mm_mul_sdm( v, &B2 ), &B1 ); q = _mm_mul_sd( q, v ); q = _mm_mul_sd( q, u ); } else { if( _mm_istrue_sd( cLTsixteen ) ) { //u2 = g * (2 * (f - u1 * F) - u1 * f); u2 = _mm_sub_sd( f, _mm_mul_sd( u1, F ) ); u2 = _mm_add_sd( u2, u2 ); u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f ) ); u2 = _mm_mul_sd( u2, g ); //q = u * (v * (A1 + v * (A2 + v * A3))); q = _mm_add_sdm( _mm_mul_sdm( v, &A3 ), &A2 ); q = _mm_add_sdm( _mm_mul_sd( q, v ), &A1 ); q = _mm_mul_sd( _mm_mul_sd( q, v ), u ); } else { xDouble f1 = _mm_cvtss_sd( f, _mm_cvtsd_ss( (xFloat) f, f )); xDouble f2 = _mm_sub_sd( f, f1 ); //u2 = g * ((2 * (f - u1 * F) - u1 * f1) - u1 * f2); u2 = _mm_sub_sd( f, _mm_mul_sd( u1, F ) ); //f - u1*f u2 = _mm_add_sd( u2, u2 ); //2*(f - u1*f) u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f1 ) ); //2*(f - u1*f) - u1*f1 u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f2 ) ); //(2*(f - u1*f) - u1*f1) - u1*f2 u2 = _mm_mul_sd( u2, g ); //g * ((2*(f - u1*f) - u1*f1) - u1*f2) //q = u * (v * (A1 + v * (A2 + v * A3))); q = _mm_add_sdm( _mm_mul_sdm( v, &A3 ), &A2 ); q = _mm_add_sdm( _mm_mul_sd( q, v ), &A1 ); q = _mm_mul_sd( _mm_mul_sd( q, v ), u ); } } xDouble p = _mm_add_sd( u2, q ); xDouble t = _mm_add_sd( a1, u1 ); a2 = _mm_add_sd( a2, _mm_add_sd( _mm_sub_sd( a1, t), u1 ) ); z2 = _mm_add_sd( p, a2 ); z1 = _mm_add_sd( t, z2 ); z1 = _mm_cvtss_sd( z1, _mm_cvtsd_ss( (xFloat) z1, z1 )); z2 = _mm_add_sd( z2, _mm_sub_sd( t, z1) ); } /* Procedure M */ { xDouble y1 = _mm_cvtss_sd( Y, _mm_cvtsd_ss( (xFloat) Y, Y )); xDouble y2 = _mm_sub_sd( Y, y1 ); w2 = _mm_add_sd( _mm_mul_sd( y2, z1 ), _mm_mul_sd( y2, z2 )); w2 = _mm_add_sd( w2, _mm_mul_sd( y1, z2 ) ); w1 = _mm_mul_sd( y1, z1 ); } //deal with overflow/underflow xDouble overflow = _mm_add_sd( w1, w2); xDouble underflow = _mm_cmplt_sdm( overflow, &Emin ); overflow = _mm_cmpgt_sdm( overflow, &Emax ); if( _mm_istrue_sd( _mm_or_pd( overflow, underflow ) ) ) { int overflowFlag = _mm_cvtsi128_si32( (xUInt64) overflow ) & OVERFLOW_FLAG; int underflowFlag = _mm_cvtsi128_si32( (xUInt64) underflow ) & UNDERFLOW_FLAG; oldMXCSR |= overflowFlag | underflowFlag | INEXACT_FLAG; X = _mm_and_pd( _mm_load_sd( &plusinf ), overflow ); goto exit; } /* Procedure E */ { int N = _mm_cvtsd_si32( _mm_mul_sdm( w1, &Inv_L) ); int j = N & 31; int M = N >> 5; xDouble dN = _mm_cvtsi32_sd( w1, N ); xDouble r = _mm_sub_sd( w1, _mm_mul_sdm( dN, &L1 ) ); r = _mm_add_sd( r, w2 ); r = _mm_sub_sd( r, _mm_mul_sdm( dN, &L2 ) ); //double p = r + r * (r * (P1 + r * (P2 + r * (P3 + r * (P4 + r * P5))))); xDouble p = _mm_add_sdm( _mm_mul_sdm( r, &P5 ), &P4 ); p = _mm_add_sdm( _mm_mul_sd( p, r ), &P3 ); p = _mm_add_sdm( _mm_mul_sd( p, r ), &P2 ); p = _mm_add_sdm( _mm_mul_sd( p, r ), &P1 ); p = _mm_add_sd( _mm_mul_sd( _mm_mul_sd( p, r ), r), r ); //double S = exptable[j].lead + exptable[j].trail; xDouble S = _mm_add_sdm( _mm_load_sd( &exptable[ 2 * j ] ), &exptable[ 2 * j +1] ); xDouble t = _mm_add_sdm( _mm_mul_sd( S, p ), &exptable[ 2 * j +1] ); t = _mm_add_sdm( t, &exptable[ 2 * j ] ); X = xscalb(t, M); oldMXCSR |= _mm_getcsr() & ALL_FLAGS; goto exit; } } //Handle special cases xDouble yIsOddInt = _mm_andnot_pd( yIsEven, yIsInt ); xDouble resultSign = _mm_and_pd( X, minusZeroD ); //take care of the x == 0 cases if( _mm_istrue_sd( xEQzero ) ) { //The x==1,NaN and y == 0, NaN cases have been weeded out above int div_zero_flag = _mm_cvtsi128_si32( (xSInt64) yLTzero ) & DIVIDE_0_FLAG; //result is Inf if y <0, otherwise result is 0 X = _mm_and_pd( _mm_load_sd( &plusinf), yLTzero ); //only apply the sign only if y is a odd integer resultSign = _mm_and_pd( yIsOddInt, resultSign ); X = _mm_or_pd( X, resultSign ); oldMXCSR |= div_zero_flag; goto exit; } //take care of the |x| == inf cases if( _mm_istrue_sd( _mm_cmpeq_sdm( fabsX, &plusinf ) ) ) { //result is Inf if y > 0, otherwise result is 0 X = _mm_andnot_pd( yLTzero, _mm_load_sd( &plusinf) ); //only apply the sign only if y is a odd integer resultSign = _mm_and_pd( yIsOddInt, resultSign ); X = _mm_or_pd( X, resultSign ); goto exit; } //take care of the |y| == inf cases if( _mm_istrue_sd( fabsyIsInf ) ) { xDouble fabsxGTone = _mm_cmpgt_sdm( fabsX, &one ); xDouble isInf = _mm_xor_pd( yLTzero, fabsxGTone ); xDouble isOne = _mm_cmpeq_sdm( X, &mOne ); // x == -1, |y| == inf returns 1 X = _mm_and_pd( isInf, _mm_load_sd( &plusinf ) ); X = _mm_sel_pd( X, _mm_load_sd( &one ), isOne ); X = _mm_xor_pd( X, signPatch ); goto exit; } //We have already handled the following cases: // y = +Inf, -Inf, +0, -0, NaN, // x = +Inf, -Inf, +0, -0, NaN, // // x is any and y is a small integer // x is negative and y is a large integer, |y| < 1e20 // x positive finite and |y| > 1e-20 && |y| < 1e20 // x is negative and y is not an integer // // We have left: // x is positive finite and |Y| is <= 1e-20 // x is positive finite and |Y| is >= 1e20 // x is negative finite and |Y| is >= 1e20 and an integer (we set x = fabs(x) above in this case, |Y| > 1e20 is always even) // x is negative finite and |Y| is <= 1e-20 and an integer (which never happens) // // we can be sure that X and Y are not inf, -inf, NaN, 0, -0 at this point, // since those cases are handled, therefore both X and Y are finite // // The cases: // y > 1e20 y < 1e-20 y< -1e20 // x > 1 inf oi 1.0 i 0.0 ui // x < 1 0.0 ui 1.0 i inf oi xDouble fabsxLTone = _mm_cmplt_sdm( fabsX, &one ); xDouble underflow = _mm_xor_pd( fabsxLTone, yLTzero ); xDouble overflow = _mm_andnot_pd( underflow, xLTinf ); //overflow = ~underflow xDouble fabsyLTone = _mm_cmplt_sdm( fabsY, &one ); xDouble fabsxEQone = _mm_cmpeq_sdm( fabsX, &one ); underflow = _mm_andnot_pd( fabsyLTone, underflow ); overflow = _mm_andnot_pd( fabsyLTone, overflow ); int overflowMask = _mm_cvtsi128_si32( (xUInt32) overflow ) & OVERFLOW_FLAG; int underflowMask = _mm_cvtsi128_si32( (xUInt32) underflow ) & UNDERFLOW_FLAG; int inexactMask = (~ _mm_cvtsi128_si32( (xUInt32) fabsxEQone )) & INEXACT_FLAG; X = _mm_and_pd( overflow, _mm_load_sd( &plusinf ) ); X = _mm_sel_pd( X, _mm_load_sd( &one), fabsyLTone ); oldMXCSR |= overflowMask | underflowMask | inexactMask; _mm_setcsr( oldMXCSR ); return X; exit: _mm_setcsr( oldMXCSR ); return _mm_xor_pd( X, signPatch ); } double pow( double x, double y ) { xDouble xd = DOUBLE_2_XDOUBLE( x ); xDouble yd = DOUBLE_2_XDOUBLE( y ); xd = _pow( xd, yd ); return XDOUBLE_2_DOUBLE( xd ); } float powf( float x, float y ) { xDouble xd = FLOAT_2_XDOUBLE( x ); xDouble yd = FLOAT_2_XDOUBLE( y ); xd = _pow( xd, yd ); return XDOUBLE_2_FLOAT( xd ); } #include #include // // This is not the worlds best power function. It is merely a simple one that // gives pretty good accuracy, with better range and precision than the double precision // one above. It should be off by no more than a few ulps, depending on the accuracy of // exp2 and log2, which are in turn depedent on the hardware fyl2x and . // // Since this is a long double implementation, I have chosen to be more conservative // about the level license I take with the C99 spec. It is rigorous about it's flags. // It is perhaps a bit too rigorous about the inexact flag, which we are allowed to abose. // It could be sped up by replacing nearbyintl with floor or round as appropriate. // long double powl( long double x, long double y ) { static const double neg_epsilon = 0x1.0p63; //if x = 1, return x for any y, even NaN if( x == 1.0 ) return x; //if y == 0, return 1 for any x, even NaN if( y == 0.0 ) return 1.0L; //get NaNs out of the way if( x != x || y != y ) return x + y; //do the work required to sort out edge cases long double fabsy = __builtin_fabsl( y ); long double fabsx = __builtin_fabsl( x ); long double infinity = __builtin_infl(); long double iy = nearbyintl( fabsy ); //we do round to nearest here so that |fy| <= 0.5 if( iy > fabsy )//convert nearbyint to floor iy -= 1.0L; int isOddInt = 0; if( fabsy == iy && fabsy != infinity && iy < neg_epsilon ) isOddInt = iy - 2.0L * nearbyintl( 0.5L * iy ); //might be 0, -1, or 1 ///test a few more edge cases //deal with x == 0 cases if( x == 0.0 ) { if( ! isOddInt ) x = 0.0L; if( y < 0 ) x = 1.0L/ x; return x; } //x == +-Inf cases if( fabsx == infinity ) { if( x < 0 ) { if( isOddInt ) { if( y < 0 ) return -0.0; else return -infinity; } else { if( y < 0 ) return 0.0; else return infinity; } } if( y < 0 ) return 0; return infinity; } //y = +-inf cases if( fabsy == infinity ) { if( x == -1 ) return 1; if( y < 0 ) { if( fabsx < 1 ) return infinity; return 0; } if( fabsx < 1 ) return 0; return infinity; } // x < 0 and y non integer case if( x < 0 && iy != fabsy ) return nanl("37") + sqrtl( -1 ); //speedy resolution of sqrt and reciprocal sqrt if( fabsy == 0.5 ) { x = sqrtl( x ); if( y < 0 ) x = 1.0L/ x; return x; } //Enter the main power function. This is done as: // // split up x**y as: // // x**y = x**(i+f) i = integer part of y, f = positive fractional part // = x**f * x**i // long double fy = fabsy - iy; long double fx = 1.0; long double ix = 1.0; //Calculate fx = x**f if( fy != 0 ) //This is expensive and may set unwanted flags. skip if unneeded { fx =log2l(x); long double fabsfx = __builtin_fabsl( fx ); long double min = fminl( fy, fabsfx ); long double max = fmaxl( fy, fabsfx ); if( y < 0 ) fy = -fy; //if fx * fy is a denormal, we get spurious underflow here, so try to avoid that if( min < 0x1.0p-8191L && max < neg_epsilon ) //a crude test for a denormal product { fx = 1; //for small numbers, skip straight to the result } else { //safe to do the work fx *= fy; fx = exp2l( fx ); } } //calculate ix = f**i //if y is negative, we will need to take the reciprocal eventually //Do it now to avoid underflow when we should get overflow //but don't do it if iy is zero to avoid spurious overflow if( y < 0 && iy != 0 ) x = 1.0L/ x; //calculate x**i by doing lots of multiplication while( iy != 0.0L ) { long double ylo; //early exit for underflow and overflow. Otherwise we may end up looping up to 16383 times here. if( x == 0.0 || x == infinity ) { ix *= x; //we know this is the right thing to do, because iy != 0 break; } //chop off 30 bits at a time if( iy > 0x1.0p30 ) { long double scaled = iy * 0x1.0p-30L; long double yhi = nearbyintl( scaled ); if( yhi > scaled ) yhi -= 1.0; ylo = iy - 0x1.0p30L * yhi; iy = yhi; } else { //faster code for the common case ylo = iy; iy = 0; } int j; int i = ylo; int mask = 1; //for each of the 30 bits set in i, multiply ix by x**(2**bit_position) if( i & 1 ) { ix *= x; i -= mask; } for( j = 0; j < 30 && i != 0; j++ ) { mask += mask; x *= x; if( i & mask ) { ix *= x; i -= mask; } } //we may have exited early from the loop above. //If so, and there are still bits in iy finish out the multiplies if( 0.0 != iy ) for( ; j < 30; j++ ) x *= x; } x = fx * ix; return x; } #pragma mark - #pragma mark cbrt static inline double _cbrt( double _x ) ALWAYS_INLINE; double _cbrt( double _x ) { static const double infinity = __builtin_inf(); static const double smallestNormal = 0x1.0p-1022; static const double twom968 = 0x1.0p-968; static const xUInt64 oneThird = { 0x55555556ULL, 0 }; static const xUInt32 denormBias = { 0, 696219795U, 0, 0 }; static const xUInt32 normalBias = { 0, 0x1200000U, 0, 0 }; static const double C = 5.42857142857142815906e-01; /* 19/35 = 0x3FE15F15, 0xF15F15F1 */ static const double D = -7.05306122448979611050e-01; /* -864/1225 = 0xBFE691DE, 0x2532C834 */ static const double E = 1.41428571428571436819e+00; /* 99/70 = 0x3FF6A0EA, 0x0EA0EA0F */ static const double F = 1.60714285714285720630e+00; /* 45/28 = 0x3FF9B6DB, 0x6DB6DB6E */ static const double G = 3.57142857142857150787e-01; /* 5/14 = 0x3FD6DB6D, 0xB6DB6DB7 */ static const xDouble expMask = { __builtin_inf(), 0 }; static const xUInt64 topMask = { 0xFFFFFFFF00000000ULL, 0 }; static const double twom20 = 0x1.0p-20; xDouble x = DOUBLE_2_XDOUBLE( _x ); if( EXPECT_FALSE( _mm_istrue_sd( _mm_cmpunord_sd( x, x ) ) ) ) return _x + _x; xDouble sign = _mm_and_pd( x, minusZeroD ); //set aside sign x = _mm_andnot_pd( minusZeroD, x ); //x = fabs(x) if( EXPECT_FALSE( _mm_istrue_sd( _mm_cmpeq_sdm( x, &infinity)) ) ) return _x; if( EXPECT_TRUE( _x != 0.0 ) ) { xDouble isDenorm = _mm_cmplt_sdm( x, &smallestNormal ); xDouble t = _mm_and_pd( _mm_load_sd( &twom968 ), isDenorm ); //t is 0 for normals, and 2**54 for denormals t = _mm_sub_sd( _mm_or_pd( t, x ), t ); //multiply of t * x for denormals. Exact for normals. //prepare cheesy estimate of t**1/3 by dividing high 32-bits by 3 xSInt32 addend = _mm_add_epi32( _mm_andnot_si128( (xSInt32) isDenorm, normalBias), denormBias ); t = (xDouble) _mm_mul_epu32( _mm_srli_epi64( (xSInt64) t, 32 ), oneThird ); t = (xDouble) _mm_add_epi32( (xSInt32) t, addend ); //new cbrt to a reported 23 bits (may be implemented in single precision xDouble r = _mm_div_sd( _mm_mul_sd( t, t ), x ); //r = t*t/x xDouble s = _mm_add_sdm( _mm_mul_sd( r, t ), &C ); //s = C + r*t xDouble y = _mm_add_sdm( s, &E ); xDouble z = _mm_div_sd( _mm_load_sd( &D ), s ); y = _mm_add_sd( y, z ); y = _mm_div_sd( _mm_load_sd( &F ), y ); y = _mm_add_sdm( y, &G ); t = _mm_mul_sd( t, y ); /* chopped to 20 bits and make it larger than cbrt(x) */ xDouble add = _mm_mul_sdm( t, &twom20 ); add = _mm_and_pd( add, expMask ); t = _mm_add_sd( t, add ); t = _mm_and_pd( t, (xDouble) topMask ); /* one step newton iteration to 53 bits with error less than 0.667 ulps */ s = _mm_mul_sd( t, t ); //exact r = _mm_div_sd( x, s ); xDouble w = _mm_add_sd( t, t ); w = _mm_add_sd( w, r ); r = _mm_sub_sd( r, t ); r = _mm_div_sd( r, w ); t = _mm_add_sd( t, _mm_mul_sd( t, r ) ); //restore sign t = _mm_or_pd( t, sign ); return XDOUBLE_2_DOUBLE( t ); } return _x; //result for 0.0 and -0.0 } double cbrt( double x ) { return _cbrt( x ); } float cbrtf( float x ) { return (float) _cbrt(x); }