#include "nut/debug.h"
#include <stdlib.h>
#include <string.h>

#include <nut/matrix.h>
#include <nut/modular_math.h>
#include <nut/factorization.h>
#include <nut/dirichlet.h>
#include <nut/sieves.h>

uint64_t nut_dirichlet_D(uint64_t max, uint64_t m){
	uint64_t y = nut_u64_nth_root(max, 2);
	uint64_t res = 0;
	for(uint64_t n = 1; n <= y; ++n){
		uint64_t term = max/n;
		res = m ? (res + term)%m : res + term;
	}
	return m ? (2*res + m - (y*y)%m)%m : 2*res - y*y;
}

uint64_t nut_dirichlet_Sigma(uint64_t max, uint64_t m){
	uint64_t y = nut_u64_nth_root(max, 2);
	uint64_t res = 0;
	for(uint64_t n = 1; n <= y; ++n){
		uint64_t k = max/n;
		// halve whichever of k and k + 1 is even BEFORE reducing mod m: reducing k
		// first changes its parity, so the halved product would not be k*(k + 1)/2 mod m
		uint64_t a = (k&1) ? k : k>>1;
		uint64_t b = (k&1) ? (k + 1)>>1 : k + 1;
		uint64_t term = m ? a%m*(b%m)%m : a*b;
		res = m ? (res + n%m*(k%m) + term)%m : res + n*k + term;
	}
	if(m){
		uint64_t cross = (y&1) ? y*y%m*((y + 1)>>1)%m : (y + 1)*(y>>1)%m*y%m;
		return (res + m - cross)%m;
	}else{
		return res - (y*y*(y + 1)>>1);
	}
}
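
// A hedged cross-check sketch (not part of the library; the NUT_DIRICHLET_EXAMPLES
// guard is a hypothetical name used only for these examples): D and Sigma should
// agree with naive O(max log max) divisor sums accumulated over each divisor d.
#ifdef NUT_DIRICHLET_EXAMPLES
static bool check_D_Sigma_naive(uint64_t max){
	uint64_t D = 0, S = 0;
	for(uint64_t d = 1; d <= max; ++d){
		for(uint64_t n = d; n <= max; n += d){
			D += 1;// each multiple n of d contributes 1 to d(n) ...
			S += d;// ... and d to sigma(n)
		}
	}
	return D == nut_dirichlet_D(max, 0) && S == nut_dirichlet_Sigma(max, 0);
}
#endif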

static inline bool is_composite_unpacked(uint64_t n, const uint8_t buf[static n/8 + 1]){
	return buf[n/8] & (1 << (n%8));
}

static inline void mark_composite_unpacked(uint64_t n, uint8_t buf[static n/8 + 1]){
	buf[n/8] |= (1 << (n%8));
}

// Implemented based on https://gbroxey.github.io/blog/2023/04/30/mult-sum-1.html,
// in turn based on https://codeforces.com/blog/entry/54090,
// which is just Euler's sieve (https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes#Euler's_sieve),
// a modification of the sieve of Eratosthenes which marks off each composite number only once.
// This makes it easier to strap computation of an arbitrary multiplicative function on top of it,
// but even though the Codeforces post claims it is linear, in practice it often performs worse than
// a basic sieve of Eratosthenes. That said, preliminary testing shows the sieve of Eratosthenes
// is only about 14% faster than Euler's sieve in the release build, so it isn't worth switching.
// Euler's sieve generally works as follows:
// 1. Allocate result[1, ..., n] (in this case, this is f_conv_u_vals and smallest_ppow)
// 2. Initialize is_c_buf[1, ..., n] = false (array of bit flags for each number from 1 to n, 0 for primes and 1 for composites)
// 3. Initialize primes = [], an empty list of primes
// 4. For i = 2, ..., n:
// 5. If i is not marked as composite by now (checked using is_c_buf), add it to the list primes
// 6. Either way, for every prime p in primes:
// 7. If p divides i: (p is the smallest prime divisor of i, since in this case we will break out of this loop)
// 8. If the largest power of the smallest prime divisor of i (smallest_ppow[i]) equals i, then i is a perfect power of p, and so is p*i
// 9. In this case, we must compute (f <*> g)(p*i) using the definition of Dirichlet convolution
// 10. We can set smallest_ppow[i*p] = i*p
// 11. Otherwise, p divides i but i is not a perfect power of p, so we can write i*p as smallest_ppow[i]*p * i/smallest_ppow[i]
// 12. These two parts are coprime, so we can find (f <*> g)(i*p) = (f <*> g)(smallest_ppow[i]*p) * (f <*> g)(i/smallest_ppow[i])
// 13. And smallest_ppow[i*p] = smallest_ppow[i]*p
// 14. Either way, break out of the loop over p
// 15. Otherwise, p does not divide i, so p and i are already coprime (and p is smaller than any prime divisor of i)
// 16. So we can set (f <*> g)(i*p) = (f <*> g)(i) * (f <*> g)(p)
// 17. And smallest_ppow[i*p] = p
// Notice that we don't really need the smallest_ppow array: we only ever use it when we know p, the smallest prime divisor of i,
// so we could instead use a while loop to find the largest power of p dividing i, but storing it is a space/time tradeoff.
// Also notice that we could cache smallest_ppow, primes, and is_c_buf, although the current implementation does not do that.
// On the other hand, the sieve of Eratosthenes generally works as follows:
// 1. Initialize result[1, ..., n] = 1
// 2. For i = 2, ..., n:
// 3. If result[i] != 1, i is not prime so we can continue
// 4. Otherwise, i is prime (with a caveat noted below)
// 5. For every power pp of i which is <= n:
// 6. Compute (f <*> g)(pp) using the definition of Dirichlet convolution, and store this as h
// 7. For every multiple m of pp <= n, starting at 2*pp, which is NOT a multiple of pp*i:
// 8. Multiply (f <*> g)(m) by h
// When i is prime, result[i] will remain set to 1 when we get to the ith loop iteration.
// HOWEVER, when (f <*> g)(n) can be 1 for composite values of n, this is a problem because result[i] = 1 no longer means i is prime.
// For convolutions which can be 1 for composite inputs, we would need to allocate a separate array to store whether or not each i is prime.
bool nut_euler_sieve_conv_u(int64_t n, int64_t modulus, const int64_t f_vals[restrict static n+1], int64_t f_conv_u_vals[restrict static n+1]){
	int64_t *smallest_ppow = malloc((n+1)*sizeof(int64_t));
	uint8_t *is_c_buf = calloc(n/8 + 1, sizeof(uint8_t));
	uint64_t num_primes = 0;
	int64_t *primes = malloc(nut_max_primes_le(n)*sizeof(int64_t));
	if(!smallest_ppow || !is_c_buf || !primes){
		free(smallest_ppow);
		free(is_c_buf);
		free(primes);
		return false;
	}
	f_conv_u_vals[1] = 1;
	for(int64_t i = 2; i <= n; ++i){
		// if i is prime, add it to the list of primes
		if(!is_composite_unpacked(i, is_c_buf)){
			primes[num_primes++] = i;
			int64_t term = f_vals[i] + 1;
			f_conv_u_vals[i] = modulus ? nut_i64_mod(term, modulus) : term;
			smallest_ppow[i] = i;
		}
		// the key feature of Euler's sieve is that it ONLY marks off products i*p for primes p
		// up to the smallest prime factor of i. This requires storing an array of
		// primes and reading it A LOT of times, but ensures that each composite number
		// is visited exactly once
		for(uint64_t j = 0; j < num_primes; ++j){
			int64_t p = primes[j], m;
			if(__builtin_mul_overflow(p, i, &m) || m > n){
				break;
			}
			mark_composite_unpacked(m, is_c_buf);
			if(i%p == 0){// i is a multiple of p, so in particular i and p are not coprime
				// also notice that because we break out of this loop at the bottom of this if statement,
				// p is in fact the smallest prime divisor of i
				int64_t ppow = smallest_ppow[i];
				int64_t B = ppow*p;
				smallest_ppow[m] = B;
				int64_t v = i/ppow;
				if(v != 1){// i is not a perfect power of p, that is, i = v*p**a with v != 1
					int64_t term = f_conv_u_vals[v]*f_conv_u_vals[B];
					f_conv_u_vals[m] = modulus ? nut_i64_mod(term, modulus) : term;
				}else{// i is a perfect power of p (and so is m)
					// from the definition, (f <*> u)(p**a) = f(1)*u(p**a) + f(p)*u(p**(a-1)) + ... + f(p**a)*u(1)
					// = f(1) + f(p) + ... + f(p**a) = (f <*> u)(p**(a-1)) + f(p**a)
					int64_t term = f_conv_u_vals[i] + f_vals[m];
					f_conv_u_vals[m] = modulus ? nut_i64_mod(term, modulus) : term;
				}
				break;
			}// otherwise, i and p are coprime, so (f <*> u)(i*p) = (f <*> u)(i)*(f <*> u)(p)
			int64_t term = f_conv_u_vals[i]*f_conv_u_vals[p];
			f_conv_u_vals[m] = modulus ? nut_i64_mod(term, modulus) : term;
			smallest_ppow[m] = p;
		}
	}
	free(smallest_ppow);
	free(primes);
	free(is_c_buf);
	return true;
}
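
// A minimal usage sketch (same hypothetical guard as above): taking f = N, so that
// f(i) = i, the sieve computes N <*> u = sigma; sigma(6) = 1 + 2 + 3 + 6 = 12.
#ifdef NUT_DIRICHLET_EXAMPLES
static bool check_conv_u_sigma(void){
	enum {LIM = 10};
	int64_t f[LIM + 1], h[LIM + 1];
	for(int64_t i = 0; i <= LIM; ++i){
		f[i] = i;// f = N
	}
	return nut_euler_sieve_conv_u(LIM, 0, f, h) && h[6] == 12;
}
#endif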

bool nut_euler_sieve_conv_N(int64_t n, int64_t modulus, const int64_t f_vals[restrict static n+1], int64_t f_conv_N_vals[restrict static n+1]){
	int64_t *smallest_ppow = malloc((n+1)*sizeof(int64_t));
	uint8_t *is_c_buf = calloc(n/8 + 1, sizeof(uint8_t));
	uint64_t num_primes = 0;
	int64_t *primes = malloc(nut_max_primes_le(n)*sizeof(int64_t));
	if(!smallest_ppow || !is_c_buf || !primes){
		free(smallest_ppow);
		free(is_c_buf);
		free(primes);
		return false;
	}
	f_conv_N_vals[1] = 1;
	for(int64_t i = 2; i <= n; ++i){
		// if i is prime, add it to the list of primes
		if(!is_composite_unpacked(i, is_c_buf)){
			primes[num_primes++] = i;
			int64_t term = f_vals[i] + i;
			f_conv_N_vals[i] = modulus ? nut_i64_mod(term, modulus) : term;
			smallest_ppow[i] = i;
		}
		// the key feature of Euler's sieve is that it ONLY marks off products i*p for primes p
		// up to the smallest prime factor of i. This requires storing an array of
		// primes and reading it A LOT of times, but ensures that each composite number
		// is visited exactly once
		for(uint64_t j = 0; j < num_primes; ++j){
			int64_t p = primes[j], m;
			if(__builtin_mul_overflow(p, i, &m) || m > n){
				break;
			}
			mark_composite_unpacked(m, is_c_buf);
			if(i%p == 0){// i is a multiple of p, so in particular i and p are not coprime
				// also notice that because we break out of this loop at the bottom of this if statement,
				// p is in fact the smallest prime divisor of i
				int64_t ppow = smallest_ppow[i];
				int64_t B = ppow*p;
				smallest_ppow[m] = B;
				int64_t v = i/ppow;
				if(v != 1){// i is not a perfect power of p, that is, i = v*p**a with v != 1
					int64_t term = f_conv_N_vals[v]*f_conv_N_vals[B];
					f_conv_N_vals[m] = modulus ? nut_i64_mod(term, modulus) : term;
				}else{// i is a perfect power of p (and so is m)
					// from the definition, (f <*> N)(p**a) = f(1)*N(p**a) + f(p)*N(p**(a-1)) + ... + f(p**a)*N(1)
					// = f(1)*p**a + f(p)*p**(a-1) + ... + f(p**a)*1 = p*(f <*> N)(p**(a-1)) + f(p**a)
					if(modulus){
						int64_t term = (p%modulus)*f_conv_N_vals[i] + f_vals[m];
						f_conv_N_vals[m] = nut_i64_mod(term, modulus);
					}else{
						f_conv_N_vals[m] = p*f_conv_N_vals[i] + f_vals[m];
					}
				}
				break;
			}// otherwise, i and p are coprime, so (f <*> N)(i*p) = (f <*> N)(i)*(f <*> N)(p)
			int64_t term = f_conv_N_vals[i]*f_conv_N_vals[p];
			f_conv_N_vals[m] = modulus ? nut_i64_mod(term, modulus) : term;
			smallest_ppow[m] = p;
		}
	}
	free(smallest_ppow);
	free(primes);
	free(is_c_buf);
	return true;
}

bool nut_euler_sieve_conv(int64_t n, int64_t modulus, const int64_t f_vals[static n+1], const int64_t g_vals[static n+1], int64_t f_conv_vals[restrict static n+1]){
	int64_t *smallest_ppow = malloc((n+1)*sizeof(int64_t));
	uint8_t *is_c_buf = calloc(n/8 + 1, sizeof(uint8_t));
	uint64_t num_primes = 0;
	int64_t *primes = malloc(nut_max_primes_le(n)*sizeof(int64_t));
	if(!smallest_ppow || !is_c_buf || !primes){
		free(smallest_ppow);
		free(is_c_buf);
		free(primes);
		return false;
	}
	f_conv_vals[1] = 1;
	for(int64_t i = 2; i <= n; ++i){
		// if i is prime, add it to the list of primes
		if(!is_composite_unpacked(i, is_c_buf)){
			primes[num_primes++] = i;
			int64_t term = f_vals[i] + g_vals[i];
			f_conv_vals[i] = modulus ? nut_i64_mod(term, modulus) : term;
			smallest_ppow[i] = i;
		}
		// the key feature of Euler's sieve is that it ONLY marks off products i*p for primes p
		// up to the smallest prime factor of i. This requires storing an array of
		// primes and reading it A LOT of times, but ensures that each composite number
		// is visited exactly once
		for(uint64_t j = 0; j < num_primes; ++j){
			int64_t p = primes[j], m;
			if(__builtin_mul_overflow(p, i, &m) || m > n){
				break;
			}
			mark_composite_unpacked(m, is_c_buf);
			if(i%p == 0){// i is a multiple of p, so in particular i and p are not coprime
				// also notice that because we break out of this loop at the bottom of this if statement,
				// p is in fact the smallest prime divisor of i
				int64_t ppow = smallest_ppow[i];
				int64_t B = ppow*p;
				smallest_ppow[m] = B;
				int64_t v = i/ppow;
				if(v != 1){// i is not a perfect power of p, that is, i = v*p**a with v != 1
					int64_t term = f_conv_vals[v]*f_conv_vals[B];
					f_conv_vals[m] = modulus ? nut_i64_mod(term, modulus) : term;
				}else{// i is a perfect power of p (and so is m)
					// from the definition, (f <*> g)(p**a) = f(1)*g(p**a) + f(p)*g(p**(a-1)) + ... + f(p**a)*g(1)
					int64_t c = f_vals[m] + g_vals[m];
					if(modulus){
						c = nut_i64_mod(c, modulus);
					}
					int64_t A = p;
					while(A < ppow){
						if(modulus){
							c = nut_i64_mod(c + f_vals[A]*g_vals[ppow], modulus);
							c = nut_i64_mod(c + f_vals[ppow]*g_vals[A], modulus);
						}else{
							c += f_vals[A]*g_vals[ppow] + f_vals[ppow]*g_vals[A];
						}
						A *= p;
						ppow /= p;
					}
					if(A == ppow){
						if(modulus){
							c = nut_i64_mod(c + f_vals[A]*g_vals[A], modulus);
						}else{
							c += f_vals[A]*g_vals[A];
						}
					}
					f_conv_vals[m] = c;
				}
				break;
			}// otherwise, i and p are coprime, so (f <*> g)(i*p) = (f <*> g)(i)*(f <*> g)(p)
			int64_t term = f_conv_vals[i]*f_conv_vals[p];
			f_conv_vals[m] = modulus ? nut_i64_mod(term, modulus) : term;
			smallest_ppow[m] = p;
		}
	}
	free(smallest_ppow);
	free(primes);
	free(is_c_buf);
	return true;
}

bool nut_Diri_init(nut_Diri *self, int64_t x, int64_t y){
	int64_t ymin = nut_u64_nth_root(x, 2);
	if(y < ymin){
		y = ymin;
	}
	int64_t yinv = x/y + 1;
	// the dense part buf[0..y] holds f(i); the sparse part buf[y + i] holds F(x/i) for 1 <= i < yinv
	if(!(self->buf = malloc((y + yinv)*sizeof(int64_t)))){
		return false;
	}
	self->x = x;
	self->y = y;
	self->yinv = yinv;
	return true;
}

void nut_Diri_destroy(nut_Diri *self){
	free(self->buf);
	*self = (nut_Diri){};
}

void nut_Diri_copy(nut_Diri *restrict dest, const nut_Diri *restrict src){
	memcpy(dest->buf, src->buf, (src->y + src->yinv)*sizeof(int64_t));
}

void nut_Diri_compute_I(nut_Diri *self){
	memset(self->buf, 0, (self->y + 1)*sizeof(int64_t));
	self->buf[1] = 1;
	for(int64_t i = 1; i < self->yinv; ++i){
		self->buf[self->y + i] = 1;
	}
}

void nut_Diri_compute_u(nut_Diri *self, int64_t m){
	for(int64_t i = 0; i <= self->y; ++i){
		self->buf[i] = 1;
	}
	for(int64_t i = 1; i < self->yinv; ++i){
		int64_t term = self->x/i;
		self->buf[self->y + i] = m ? nut_i64_mod(term, m) : term;
	}
}
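
// A minimal usage sketch (hypothetical guard, not part of the library): after
// nut_Diri_compute_u, the dense part holds u(i) = 1 and the sparse part holds the
// summatory values U(x/i) = x/i, so sparse index 1 gives U(x) = x.
#ifdef NUT_DIRICHLET_EXAMPLES
static bool check_compute_u(void){
	nut_Diri tbl = {};
	bool ok = false;
	if(nut_Diri_init(&tbl, 100, 0)){
		nut_Diri_compute_u(&tbl, 0);
		ok = nut_Diri_get_dense(&tbl, 7) == 1 && nut_Diri_get_sparse(&tbl, 1) == 100;
	}
	nut_Diri_destroy(&tbl);
	return ok;
}
#endif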

void nut_Diri_compute_N(nut_Diri *self, int64_t m){
	for(int64_t i = 0; i <= self->y; ++i){
		self->buf[i] = m ? i%m : i;
	}
	for(int64_t i = 1; i < self->yinv; ++i){
		int64_t v = self->x/i;
		// halve whichever of v and v + 1 is even BEFORE reducing mod m (as in
		// nut_Diri_compute_conv_N below): reducing v first changes its parity,
		// so the halved product would no longer be v*(v + 1)/2 mod m
		int64_t a = (v&1) ? v : v >> 1;
		int64_t b = (v&1) ? (v + 1) >> 1 : v + 1;
		self->buf[self->y + i] = m ? a%m*(b%m)%m : a*b;
	}
}

// fills the dense part with i**k and the sparse part with the Faulhaber sums
// 1**k + ... + (x/i)**k, whose coefficients come from inverting a short Pascal matrix
bool nut_Diri_compute_Nk(nut_Diri *restrict self, uint64_t k, int64_t m){
	if(!m){
		nut_i64_Matrix pascal_mat [[gnu::cleanup(nut_i64_Matrix_destroy)]] = {}, faulhaber_mat [[gnu::cleanup(nut_i64_Matrix_destroy)]] = {};
		int64_t *vand_vec [[gnu::cleanup(cleanup_free)]] = malloc((k + 1)*sizeof(int64_t));
		if(!vand_vec){
			return false;
		}
		if(!nut_i64_Matrix_init(&pascal_mat, k + 1, k + 1)){
			return false;
		}
		if(!nut_i64_Matrix_init(&faulhaber_mat, k + 1, k + 1)){
			return false;
		}
		nut_i64_Matrix_fill_short_pascal(&pascal_mat);
		int64_t denom = nut_i64_Matrix_invert_ltr(&pascal_mat, &faulhaber_mat);
		for(int64_t i = 0; i <= self->y; ++i){
			self->buf[i] = nut_u64_pow(i, k);
		}
		for(int64_t i = 1; i < self->yinv; ++i){
			int64_t v = self->x/i;
			nut_i64_Matrix_fill_vandemond_vec(v + 1, k + 1, m, vand_vec);
			int64_t tot = 0;
			for(uint64_t j = 0; j < k + 1; ++j){
				int64_t a = faulhaber_mat.buf[(k + 1)*k + j]*vand_vec[j];
				tot = tot + a;
			}
			tot /= denom;
			self->buf[self->y + i] = tot;
		}
		return true;
	}else{
		nut_u64_ModMatrix pascal_mat [[gnu::cleanup(nut_u64_ModMatrix_destroy)]] = {}, faulhaber_mat [[gnu::cleanup(nut_u64_ModMatrix_destroy)]] = {};
		uint64_t *vand_vec [[gnu::cleanup(cleanup_free)]] = malloc((k + 1)*sizeof(uint64_t));
		if(!vand_vec){
			return false;
		}
		if(!nut_u64_ModMatrix_init(&pascal_mat, k + 1, k + 1, m)){
			return false;
		}
		if(!nut_u64_ModMatrix_init(&faulhaber_mat, k + 1, k + 1, m)){
			return false;
		}
		nut_u64_ModMatrix_fill_short_pascal(&pascal_mat);
		if(!nut_u64_ModMatrix_invert_ltr(&pascal_mat, &faulhaber_mat)){
			return false;
		}
		for(int64_t i = 0; i <= self->y; ++i){
			self->buf[i] = nut_u64_powmod(i, k, m);
		}
		for(int64_t i = 1; i < self->yinv; ++i){
			int64_t v = self->x/i;
			nut_u64_Matrix_fill_vandemond_vec((v + 1)%m, k + 1, m, vand_vec);
			int64_t tot = 0;
			for(uint64_t j = 0; j < k + 1; ++j){
				uint64_t a = faulhaber_mat.buf[(k + 1)*k + j]*vand_vec[j];
				tot = (tot + a)%m;
			}
			self->buf[self->y + i] = tot;
		}
		return true;
	}
}
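
// A hedged cross-check sketch (hypothetical guard): for k = 2 the sparse part
// should hold square pyramidal numbers, so sparse index 1 should be
// 1**2 + ... + x**2 = x(x + 1)(2x + 1)/6 (for x small enough not to overflow).
#ifdef NUT_DIRICHLET_EXAMPLES
static bool check_Nk(int64_t x){
	nut_Diri tbl = {};
	bool ok = false;
	if(nut_Diri_init(&tbl, x, 0)){
		ok = nut_Diri_compute_Nk(&tbl, 2, 0) && nut_Diri_get_sparse(&tbl, 1) == x*(x + 1)*(2*x + 1)/6;
	}
	nut_Diri_destroy(&tbl);
	return ok;
}
#endif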

void nut_Diri_compute_mertens(nut_Diri *restrict self, int64_t m, const uint8_t mobius[restrict static self->y/4 + 1]){
	nut_Diri_set_dense(self, 0, 0);
	nut_Diri_set_dense(self, 1, 1);
	// temporarily store the Mertens function M(i) in the dense part
	for(int64_t i = 2, acc = 1, v; i <= self->y; ++i){
		if((v = nut_Bitfield2_arr_get(mobius, i)) == 3){
			v = -1;
		}
		nut_Diri_set_dense(self, i, acc += v);
	}
	// since u <*> mu = I, the hyperbola formula gives
	// M(v) = 1 - sum(j = 1 ... vr, mu(j)*(v/j)) - sum(j = 2 ... vr, M(v/j)) + M(vr)*vr
	for(int64_t i = self->yinv - 1; i; --i){
		int64_t v = self->x/i;
		int64_t M = 1;
		int64_t vr = nut_u64_nth_root(v, 2);
		for(int64_t j = 1; j <= vr; ++j){
			// the dense part holds M right now, so dense(j) - dense(j - 1) recovers mu(j)
			int64_t term = (nut_Diri_get_dense(self, j) - nut_Diri_get_dense(self, j - 1))*(v/j);
			if(m){
				M = nut_i64_mod(M - term, m);
			}else{
				M -= term;
			}
		}
		for(int64_t j = 2, term; j <= vr; ++j){
			if(i*j >= self->yinv){
				term = nut_Diri_get_dense(self, v/j);
			}else{
				term = nut_Diri_get_sparse(self, i*j);
			}
			if(m){
				M = nut_i64_mod(M - term, m);
			}else{
				M -= term;
			}
		}
		int64_t term;
		if(vr <= self->y){
			term = nut_Diri_get_dense(self, vr)*vr;
		}else{
			term = nut_Diri_get_sparse(self, self->x/vr)*vr;
		}
		if(m){
			M = nut_i64_mod(M + term, m);
		}else{
			M += term;
		}
		nut_Diri_set_sparse(self, i, M);
	}
	// finally, overwrite the dense part with mu itself so the table holds (mu, M) like the other tables
	for(int64_t i = 2, v; i <= self->y; ++i){
		if((v = nut_Bitfield2_arr_get(mobius, i)) == 3){
			v = -1;
		}
		nut_Diri_set_dense(self, i, v);
	}
}
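
// A minimal usage sketch (hypothetical guard): given a 2-bit packed Mobius table
// covering at least [0, y] in the encoding read by nut_Bitfield2_arr_get (3 <-> -1),
// the sparse part ends up holding the Mertens function, so sparse index 1 is M(x).
#ifdef NUT_DIRICHLET_EXAMPLES
static int64_t example_mertens(int64_t x, const uint8_t *mobius){
	nut_Diri tbl = {};
	int64_t M = 0;
	if(nut_Diri_init(&tbl, x, 0)){
		nut_Diri_compute_mertens(&tbl, 0, mobius);
		M = nut_Diri_get_sparse(&tbl, 1);
	}
	nut_Diri_destroy(&tbl);
	return M;
}
#endif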

bool nut_Diri_compute_dk(nut_Diri *restrict self, uint64_t k, int64_t m, nut_Diri *restrict f_tbl, nut_Diri *restrict g_tbl){
	if(self->y != f_tbl->y || self->y != g_tbl->y || self->x != f_tbl->x || self->x != g_tbl->x){
		return false;
	}else if(!k){
		nut_Diri_compute_I(self);
		return true;
	}else if(k == 1){
		nut_Diri_compute_u(self, m);
		return true;
	}
	// d_k = u <*> ... <*> u (k times), so compute u**k by binary exponentiation,
	// using f_tbl and g_tbl as scratch tables (both are clobbered)
	nut_Diri_compute_u(f_tbl, m);
	nut_Diri *t = self, *s = f_tbl, *r = g_tbl;
	while(k%2 == 0){
		if(!nut_Diri_compute_conv(t, m, s, s)){
			return false;
		}else{
			void *tmp = t;
			t = s;
			s = tmp;
		}//s = s*s
		k >>= 1;
	}
	nut_Diri_copy(r, s);
	while((k >>= 1)){
		if(!nut_Diri_compute_conv(t, m, s, s)){
			return false;
		}else{
			void *tmp = t;
			t = s;
			s = tmp;
		}//s = s*s
		if(k%2){
			if(!nut_Diri_compute_conv(t, m, r, s)){
				return false;
			}else{
				void *tmp = t;
				t = r;
				r = tmp;
			}//r = r*s
		}
	}
	if(r != self){
		nut_Diri_copy(self, r);
	}
	return true;
}
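
// A hedged cross-check sketch (hypothetical guard): d_2 = u <*> u is the ordinary
// divisor-count function, so computing d_k with k = 2 should agree with
// nut_dirichlet_D. Note that f_tbl and g_tbl are scratch tables and get clobbered.
#ifdef NUT_DIRICHLET_EXAMPLES
static bool check_dk(int64_t x){
	nut_Diri h = {}, f = {}, g = {};
	bool ok = false;
	if(nut_Diri_init(&h, x, 0) && nut_Diri_init(&f, x, 0) && nut_Diri_init(&g, x, 0)){
		ok = nut_Diri_compute_dk(&h, 2, 0, &f, &g) &&
			(uint64_t)nut_Diri_get_sparse(&h, 1) == nut_dirichlet_D(x, 0);
	}
	nut_Diri_destroy(&h);
	nut_Diri_destroy(&f);
	nut_Diri_destroy(&g);
	return ok;
}
#endif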

bool nut_Diri_compute_J(nut_Diri *restrict self, uint64_t p){
	int64_t *partial_sums [[gnu::cleanup(cleanup_free)]] = malloc(p*sizeof(int64_t));
	uint64_t *is_qr [[gnu::cleanup(cleanup_free)]] = nut_u64_make_jacobi_tbl(p, partial_sums);
	if(!partial_sums || !is_qr){
		return false;
	}
	for(uint64_t r = 0; r < p; ++r){
		int64_t v = nut_u64_jacobi_tbl_get(r, p, is_qr);
		for(int64_t i = r; i <= self->y; i += p){
			self->buf[i] = v;
		}
	}
	for(int64_t i = 1; i < self->yinv; ++i){
		int64_t r = self->x/i%p;
		self->buf[self->y + i] = partial_sums[r];
	}
	return true;
}

bool nut_Diri_compute_Chi_balanced(nut_Diri *restrict self, uint64_t n, int64_t m, const int64_t period_tbl[restrict static n]){
	int64_t *partial_sums [[gnu::cleanup(cleanup_free)]] = malloc(n*sizeof(int64_t));
	if(!partial_sums){
		return false;
	}
	int64_t acc = period_tbl[0];
	partial_sums[0] = acc;
	for(uint64_t r = 1; r < n; ++r){
		acc += period_tbl[r];
		if(acc >= m){
			acc -= m;
		}
		partial_sums[r] = acc;
	}
	for(uint64_t r = 0; r < n; ++r){
		int64_t v = period_tbl[r];
		for(int64_t i = r; i <= self->y; i += n){
			self->buf[i] = v;
		}
	}
	for(int64_t i = 1; i < self->yinv; ++i){
		int64_t r = self->x/i%n;
		self->buf[self->y + i] = partial_sums[r];
	}
	return true;
}

bool nut_Diri_compute_pi(nut_Diri *restrict self){
	// Lucy_Hedgehog style prime counting: start every entry at |[2, v]| = v - 1,
	// then for each prime p remove the numbers whose smallest prime factor is p,
	// leaving each entry equal to pi(v)
	self->buf[0] = 0;
	for(int64_t i = 1; i <= self->y; ++i){
		self->buf[i] = i - 1;
	}
	for(int64_t i = 1; i < self->yinv; ++i){
		int64_t term = self->x/i;
		self->buf[self->y + i] = term - 1;
	}
	for(int64_t p = 2; p <= self->y; ++p){
		int64_t c = self->buf[p - 1];
		if(self->buf[p] == c){
			continue;
		}
		for(int64_t i = 1; i < self->yinv; ++i){
			int64_t v = self->x/i;
			if(v < p*p){
				goto CONTINUE_PRIMELOOP;
			}
			int64_t j = v/p;
			int64_t m = j <= self->y ? self->buf[j] : self->buf[self->y + self->x/j];
			self->buf[self->y + i] -= m - c;
		}
		for(int64_t v = self->y; v >= 0; --v){
			if(v < p*p){
				break;
			}
			self->buf[v] -= (self->buf[v/p] - c);
		}
		CONTINUE_PRIMELOOP:;
	}
	return true;
}
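
// A minimal usage sketch (hypothetical guard): after nut_Diri_compute_pi the dense
// part holds pi(i) and the sparse part holds pi(x/i), so sparse index 1 is pi(x);
// for example pi(100) = 25.
#ifdef NUT_DIRICHLET_EXAMPLES
static bool check_compute_pi(void){
	nut_Diri tbl = {};
	bool ok = false;
	if(nut_Diri_init(&tbl, 100, 0)){
		ok = nut_Diri_compute_pi(&tbl) && nut_Diri_get_sparse(&tbl, 1) == 25;
	}
	nut_Diri_destroy(&tbl);
	return ok;
}
#endif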

bool nut_Diri_compute_conv_u(nut_Diri *restrict self, int64_t m, const nut_Diri *restrict f_tbl){
	if(self->y != f_tbl->y || self->x != f_tbl->x){
		return false;
	}
	// use the dense part of self to temporarily store the sums of f for small n up to y
	self->buf[0] = 0;
	for(int64_t i = 1; i <= self->y; ++i){
		int64_t term = self->buf[i-1] + f_tbl->buf[i];
		self->buf[i] = m ? nut_i64_mod(term, m) : term;
	}
	// by the hyperbola formula:
	// (f <*> u)(v) = sum(n = 1 ... vr, F(v/n)) + sum(n = 1 ... vr, f(n)*(v/n)) - F(vr)*vr
	// both sums are folded together into the following loop
	for(int64_t i = 1; i < self->yinv; ++i){
		int64_t v = self->x/i;
		int64_t vr = nut_u64_nth_root(v, 2);// TODO: v is close to the previous v, so only one Newton step should be needed here
		int64_t h = 0;
		for(int64_t n = 1, term; n <= vr; ++n){
			if(v/n <= self->y){
				term = nut_Diri_get_dense(self, v/n);
			}else{
				term = nut_Diri_get_sparse(f_tbl, i*n);
			}
			if(m){
				h = nut_i64_mod(h + term, m);
			}else{
				h += term;
			}
			if(m){
				term = (v/n)%m*nut_Diri_get_dense(f_tbl, n);
				h = nut_i64_mod(h + term, m);
			}else{
				h += nut_Diri_get_dense(f_tbl, n)*(v/n);
			}
		}
		int64_t term = nut_Diri_get_dense(self, vr)*vr;
		if(m){
			h = nut_i64_mod(h - term, m);
		}else{
			h -= term;
		}
		nut_Diri_set_sparse(self, i, h);
	}
	return nut_euler_sieve_conv_u(self->y, m, f_tbl->buf, self->buf);
}
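
// A hedged cross-check sketch (hypothetical guard): starting from the table for N,
// one application of conv_u yields sigma = N <*> u, whose summatory value at x
// should match nut_dirichlet_Sigma.
#ifdef NUT_DIRICHLET_EXAMPLES
static bool check_conv_u_table(int64_t x){
	nut_Diri f = {}, h = {};
	bool ok = false;
	if(nut_Diri_init(&f, x, 0) && nut_Diri_init(&h, x, 0)){
		nut_Diri_compute_N(&f, 0);
		ok = nut_Diri_compute_conv_u(&h, 0, &f) &&
			(uint64_t)nut_Diri_get_sparse(&h, 1) == nut_dirichlet_Sigma(x, 0);
	}
	nut_Diri_destroy(&f);
	nut_Diri_destroy(&h);
	return ok;
}
#endif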

bool nut_Diri_compute_conv_N(nut_Diri *restrict self, int64_t m, const nut_Diri *restrict f_tbl){
	if(self->y != f_tbl->y || self->x != f_tbl->x){
		return false;
	}
	// use the dense part of self to temporarily store the sums of f for small n up to y
	self->buf[0] = 0;
	for(int64_t i = 1; i <= self->y; ++i){
		int64_t term = self->buf[i-1] + f_tbl->buf[i];
		self->buf[i] = m ? nut_i64_mod(term, m) : term;
	}
	for(int64_t i = 1; i < self->yinv; ++i){
		int64_t v = self->x/i;
		int64_t vr = nut_u64_nth_root(v, 2);
		int64_t h = 0;
		// by the hyperbola formula:
		// (f <*> N)(v) = sum(n = 1 ... vr, f(n)*(v/n)*((v/n) + 1)/2) + sum(n = 1 ... vr, F(v/n)*n) - F(vr)*vr*(vr + 1)/2
		// both sums are folded together into the following loop
		for(int64_t n = 1, term; n <= vr; ++n){
			if(v/n <= self->y){
				term = nut_Diri_get_dense(self, v/n)*n;
			}else{
				term = nut_Diri_get_sparse(f_tbl, i*n)*n;
			}
			if(m){
				h = nut_i64_mod(h + term, m);
			}else{
				h += term;
			}
			term = nut_Diri_get_dense(f_tbl, n);
			int64_t k = v/n;
			if(m){
				int64_t Gvn = (k&1) ? k%m*(((k + 1)>>1)%m) : (k>>1)%m*((k + 1)%m);
				Gvn %= m;
				h = nut_i64_mod(h + term*Gvn, m);
			}else{
				int64_t Gvn = (k&1) ? k*((k + 1)>>1) : (k>>1)*(k + 1);
				h += term*Gvn;
			}
		}
		// finally we apply the corrective term (remove double counted values)
		int64_t term = nut_Diri_get_dense(self, vr);
		int64_t Gvr = (vr&1) ? vr*((vr + 1) >> 1) : (vr >> 1)*(vr + 1);
		if(m){
			Gvr %= m;
			h = nut_i64_mod(h - term*Gvr, m);
		}else{
			h -= term*Gvr;
		}
		nut_Diri_set_sparse(self, i, h);
	}
	return nut_euler_sieve_conv_N(self->y, m, f_tbl->buf, self->buf);
}

bool nut_Diri_compute_conv(nut_Diri *restrict self, int64_t m, const nut_Diri *f_tbl, const nut_Diri *g_tbl){
	if(self->y != f_tbl->y || self->x != f_tbl->x || self->y != g_tbl->y || self->x != g_tbl->x){
		return false;
	}
	// use the dense part of self to temporarily store the sums of f for small n up to y
	self->buf[0] = 0;
	for(int64_t i = 1; i <= self->y; ++i){
		int64_t term = self->buf[i-1] + f_tbl->buf[i];
		self->buf[i] = m ? nut_i64_mod(term, m) : term;
	}
	for(int64_t i = 1; i < self->yinv; ++i){
		int64_t v = self->x/i;
		int64_t vr = nut_u64_nth_root(v, 2);
		int64_t h = 0;
		for(int64_t n = 1, term; n <= vr; ++n){
			if(v/n <= self->y){
				term = nut_Diri_get_dense(self, v/n)*nut_Diri_get_dense(g_tbl, n);
			}else{
				term = nut_Diri_get_sparse(f_tbl, i*n)*nut_Diri_get_dense(g_tbl, n);
			}
			if(m){
				h = nut_i64_mod(h + term, m);
			}else{
				h += term;
			}
		}
		nut_Diri_set_sparse(self, i, h);
	}
	// use the dense part of self to temporarily store the sums of g for small n up to y.
	// Simultaneously, apply the adjustments to the h values:
	// H(x/j) has an adjustment of F(i)G(i) where i = sqrt(x/j),
	// so for every i = sqrt(x/j) value, we need to adjust all H(x/j) values
	// for j in the range (x/(i + 1)**2, x/i**2] && [1, yinv).
	// So we can ignore some small i values where this intersection is empty, that is, yinv <= x/(i + 1)**2:
	// (i + 1)**2 <= x/yinv <=> i + 1 <= sqrt(x/yinv),
	// that is, for i <= sqrt(x/yinv) - 1, there are no corresponding j to adjust.
	// note that x/yinv = y and i <= sqrt(y) - 1 <=> i < sqrt(y)
	int64_t i_ub = nut_u64_nth_root(self->y, 2);
	for(int64_t i = 1; i < i_ub; ++i){
		int64_t term = self->buf[i-1] + g_tbl->buf[i];
		self->buf[i] = m ? nut_i64_mod(term, m) : term;
	}
	for(int64_t i = i_ub; i <= self->y; ++i){
		int64_t F = self->buf[i];
		int64_t G = self->buf[i-1] + g_tbl->buf[i];
		if(m){
			G = nut_i64_mod(G, m);
		}
		self->buf[i] = G;
		int64_t j_ub = self->x/(i*i) + 1;
		if(j_ub > self->yinv){
			j_ub = self->yinv;
		}
		for(int64_t j = self->x/((i + 1)*(i + 1)) + 1; j < j_ub; ++j){
			int64_t h = nut_Diri_get_sparse(self, j);
			if(m){
				h = nut_i64_mod(h - F*G, m);
			}else{
				h -= F*G;
			}
			nut_Diri_set_sparse(self, j, h);
		}
	}
	for(int64_t i = 1; i < self->yinv; ++i){
		int64_t v = self->x/i;
		int64_t vr = nut_u64_nth_root(v, 2);
		int64_t h = nut_Diri_get_sparse(self, i);
		for(int64_t n = 1, term; n <= vr; ++n){
			if(v/n <= self->y){
				term = nut_Diri_get_dense(f_tbl, n)*nut_Diri_get_dense(self, v/n);
			}else{
				term = nut_Diri_get_dense(f_tbl, n)*nut_Diri_get_sparse(g_tbl, i*n);
			}
			if(m){
				h = nut_i64_mod(h + term, m);
			}else{
				h += term;
			}
		}
		nut_Diri_set_sparse(self, i, h);
	}
	return nut_euler_sieve_conv(self->y, m, f_tbl->buf, g_tbl->buf, self->buf);
}
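
// A hedged cross-check sketch (hypothetical guard): convolving the table for u with
// itself gives the divisor-count function d = u <*> u, whose summatory value at x
// should match nut_dirichlet_D.
#ifdef NUT_DIRICHLET_EXAMPLES
static bool check_conv_d(int64_t x){
	nut_Diri u_tbl = {}, d_tbl = {};
	bool ok = false;
	if(nut_Diri_init(&u_tbl, x, 0) && nut_Diri_init(&d_tbl, x, 0)){
		nut_Diri_compute_u(&u_tbl, 0);
		ok = nut_Diri_compute_conv(&d_tbl, 0, &u_tbl, &u_tbl) &&
			(uint64_t)nut_Diri_get_sparse(&d_tbl, 1) == nut_dirichlet_D(x, 0);
	}
	nut_Diri_destroy(&u_tbl);
	nut_Diri_destroy(&d_tbl);
	return ok;
}
#endif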

bool nut_Diri_convdiv(nut_Diri *restrict self, int64_t m, const nut_Diri *restrict f_tbl, const nut_Diri *restrict g_tbl){
	// Here we want to find the diri table for h where f = g <*> h. We recall the hyperbola formula again:
	// F(v) = sum(n = 1 ... vr, g(n)H(v/n)) + sum(n = 1 ... vr, G(v/n)h(n)) - G(vr)H(vr)
	// We can split out the n = 1 term from the first sum and rearrange to get
	// H(v) = F(v) + G(vr)H(vr) - sum(n = 2 ... vr, g(n)H(v/n)) - sum(n = 1 ... vr, G(v/n)h(n))
	// Now we want to be able to evaluate the H(v) values from smallest v to largest.
	// In the Diri_compute_conv* functions, the two sums and the correction term all depend only on the input functions,
	// so we can store partial sums of the input functions for low "dense" values in the dense part of the output.
	// Here, that isn't possible, due to the dependence of H(v) on smaller H(v). So we must use a temporary array.
	if(self->y != f_tbl->y || self->x != f_tbl->x || self->y != g_tbl->y || self->x != g_tbl->x){
		return false;
	}
	int64_t *H_dense [[gnu::cleanup(cleanup_free)]] = malloc((self->y + 1)*sizeof(int64_t));
	int64_t *G_dense [[gnu::cleanup(cleanup_free)]] = malloc((self->y + 1)*sizeof(int64_t));
	if(!H_dense || !G_dense){
		return false;
	}
	memset(self->buf, 0, (self->y + 1)*sizeof(int64_t));
	H_dense[0] = G_dense[0] = 0;
	// First, we "sieve" the values of h into the dense part of the output
	for(int64_t i = 1; i <= self->y; ++i){
		int64_t term = self->buf[i] + f_tbl->buf[i];
		self->buf[i] = m ? nut_i64_mod(term, m) : term;
		if(!self->buf[i]){
			continue;
		}
		for(int64_t j = 2; j <= self->y/i; ++j){
			int64_t term = self->buf[i*j] - self->buf[i]*g_tbl->buf[j];
			self->buf[i*j] = m ? nut_i64_mod(term, m) : term;
		}
	}
	// Now, accumulate the sums of dense h values into H_dense
	for(int64_t i = 1; i <= self->y; ++i){
		int64_t term = self->buf[i] + H_dense[i - 1];
		H_dense[i] = m ? nut_i64_mod(term, m) : term;
	}
	// And accumulate the sums of dense g values into G_dense
	for(int64_t i = 1; i <= self->y; ++i){
		int64_t term = g_tbl->buf[i] + G_dense[i - 1];
		G_dense[i] = m ? nut_i64_mod(term, m) : term;
	}
	// Now we populate the sparse H values using the formula
	// H(v) = F(v) + G(vr)H(vr) - sum(n = 2 ... vr, g(n)H(v/n)) - sum(n = 1 ... vr, G(v/n)h(n))
	// We will do the sums first for each v, because when doing a modulus, subtraction is more expensive than addition.
	// This requires H(v/2), ..., H(vr) to have been computed before, which we can accomplish by computing
	// H in terms of increasing values of v...
	// So i is decreasing in this loop...
	for(int64_t i = self->yinv - 1; i >= 1; --i){
		// so that v will be increasing here
		int64_t v = self->x/i;
		int64_t vr = nut_u64_nth_root(v, 2);
		int64_t H = 0; // H starts at 0 because we add the sums first
		if(1 <= vr){ // do the extra term of G(v/n)h(n) where n = 1 (h(1) = 1 since f(1) = g(1) = 1)
			// Because i < self->yinv, v >= self->y by definition of self->yinv,
			// so it is only possible for v to be <= self->y at most once (if it equals self->y when i is self->yinv - 1).
			// But if self->y == self->yinv - 1, we store both h(self->y) in the dense part and H(self->yinv - 1) in the sparse part.
			// Therefore, the value we want always exists in the sparse part here
			assert(v >= self->y);
			H = nut_Diri_get_sparse(g_tbl, i);
		}
		for(int64_t n = 2, gH_term, Gh_term; n <= vr; ++n){
			if(v/n <= self->y){
				gH_term = nut_Diri_get_dense(g_tbl, n)*H_dense[v/n];
				Gh_term = nut_Diri_get_dense(self, n)*G_dense[v/n];
			}else{
				gH_term = nut_Diri_get_dense(g_tbl, n)*nut_Diri_get_sparse(self, i*n);
				Gh_term = nut_Diri_get_dense(self, n)*nut_Diri_get_sparse(g_tbl, i*n);
			}
			H = m ? nut_i64_mod(H + gH_term%m + Gh_term%m, m) : H + gH_term + Gh_term;
		}
		// Now H consists of all the sums, so we have to subtract it from F(v) + G(vr)H(vr)
		int64_t term = G_dense[vr]*H_dense[vr];
		assert(v >= self->y); // see above comment about i < self->yinv
		term = m ? nut_Diri_get_sparse(f_tbl, i) + term%m : nut_Diri_get_sparse(f_tbl, i) + term;
		H = m ? nut_i64_mod(term - H, m) : term - H;
		nut_Diri_set_sparse(self, i, H);
	}
	return true;
}
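
// A hedged round-trip sketch (hypothetical guard): convdiv should invert
// compute_conv_u. Building sigma = N <*> u from the table for N and then dividing
// by u should recover N, so sparse index 1 of the quotient is 1 + ... + x = x(x + 1)/2.
#ifdef NUT_DIRICHLET_EXAMPLES
static bool check_convdiv(int64_t x){
	nut_Diri n_tbl = {}, u_tbl = {}, s_tbl = {}, h_tbl = {};
	bool ok = false;
	if(nut_Diri_init(&n_tbl, x, 0) && nut_Diri_init(&u_tbl, x, 0) &&
			nut_Diri_init(&s_tbl, x, 0) && nut_Diri_init(&h_tbl, x, 0)){
		nut_Diri_compute_N(&n_tbl, 0);
		nut_Diri_compute_u(&u_tbl, 0);
		ok = nut_Diri_compute_conv_u(&s_tbl, 0, &n_tbl) &&
			nut_Diri_convdiv(&h_tbl, 0, &s_tbl, &u_tbl) &&
			nut_Diri_get_sparse(&h_tbl, 1) == x*(x + 1)/2;
	}
	nut_Diri_destroy(&n_tbl);
	nut_Diri_destroy(&u_tbl);
	nut_Diri_destroy(&s_tbl);
	nut_Diri_destroy(&h_tbl);
	return ok;
}
#endif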
|