Skip to content

Commit

Permalink
[Lang] Fix error with loop unique analysis for MatrixPtrStmt (#8307)
Browse files Browse the repository at this point in the history
Issue: fix #8301

### Brief Summary

<!--
copilot:summary
-->
### <samp>🤖 Generated by Copilot at 6f3204e</samp>

This pull request enhances the performance and functionality of matrix
operations in Taichi by fixing a bug in the loop unique analysis for
matrix pointers, adding support for implicit FEM and SVD, and adding new
test cases to verify the correctness and effectiveness of the changes.
The files `taichi/analysis/gather_uniquely_accessed_pointers.cpp`,
`tests/python/test_implicit_fem.py`, and `tests/python/test_matrix.py`
are modified accordingly.

### Walkthrough

<!--
copilot:walkthrough
-->
### <samp>🤖 Generated by Copilot at 6f3204e</samp>

* Fix a bug in the loop unique analysis for matrix pointers that did not
handle loop invariant offsets
([link](https://github.com/taichi-dev/taichi/pull/8307/files?diff=unified&w=0#diff-b0f7fa0caaec4625e20b1a8e166aa5f16ee7b7071a7e3ecbb255e9c1bcc0cb2eL117-R119))
  • Loading branch information
jim19930609 authored Aug 4, 2023
1 parent 125c8e5 commit 21fc9f9
Show file tree
Hide file tree
Showing 3 changed files with 327 additions and 1 deletion.
4 changes: 3 additions & 1 deletion taichi/analysis/gather_uniquely_accessed_pointers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ class LoopUniqueStmtSearcher : public BasicStmtVisitor {
return true;
}

if (loop_unique_.find(stmt->offset) != loop_unique_.end()) {
if (loop_invariant_.find(stmt->offset) != loop_invariant_.end()) {
ret &= true;
} else if (loop_unique_.find(stmt->offset) != loop_unique_.end()) {
ret &= (loop_unique_.find(stmt->offset)->second == -1);
} else {
return false;
Expand Down
309 changes: 309 additions & 0 deletions tests/python/test_implicit_fem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,309 @@
import numpy as np
import pytest

import taichi as ti
from tests import test_utils


@test_utils.test(arch=[ti.cpu, ti.cuda])
def test_implicit_fem():
E, nu = 5e4, 0.0
mu, la = E / (2 * (1 + nu)), E * nu / ((1 + nu) * (1 - 2 * nu)) # lambda = 0
density = 1000.0
dt = 2e-4

use_sparse = 0

n_cube = np.array([5] * 3)
n_verts = np.product(n_cube)
n_cells = 5 * np.product(n_cube - 1)
dx = 1 / (n_cube.max() - 1)

F_vertices = ti.Vector.field(4, dtype=ti.i32, shape=n_cells)

F_x = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
F_ox = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
F_v = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
F_f = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
F_mul_ans = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
F_m = ti.field(dtype=ti.f32, shape=n_verts)

n_cells = (n_cube - 1).prod() * 5
F_B = ti.Matrix.field(3, 3, dtype=ti.f32, shape=n_cells)
F_W = ti.field(dtype=ti.f32, shape=n_cells)

@ti.func
def i2p(I):
return (I.x * n_cube[1] + I.y) * n_cube[2] + I.z

@ti.func
def set_element(e, I, verts):
for i in ti.static(range(3 + 1)):
F_vertices[e][i] = i2p(I + (([verts[i] >> k for k in range(3)] ^ I) & 1))

@ti.kernel
def get_vertices():
"""
This kernel partitions the cube into tetrahedrons.
Each unit cube is divided into 5 tetrahedrons.
"""
for I in ti.grouped(ti.ndrange(*(n_cube - 1))):
e = ((I.x * (n_cube[1] - 1) + I.y) * (n_cube[2] - 1) + I.z) * 5
for i, j in ti.static(enumerate([0, 3, 5, 6])):
set_element(e + i, I, (j, j ^ 1, j ^ 2, j ^ 4))
set_element(e + 4, I, (1, 2, 4, 7))
for I in ti.grouped(ti.ndrange(*(n_cube))):
F_ox[i2p(I)] = I * dx

@ti.func
def Ds(verts):
return ti.Matrix.cols([F_x[verts[i]] - F_x[verts[3]] for i in range(3)])

@ti.func
def ssvd(F):
U, sig, V = ti.svd(F)
if U.determinant() < 0:
for i in ti.static(range(3)):
U[i, 2] *= -1
sig[2, 2] = -sig[2, 2]
if V.determinant() < 0:
for i in ti.static(range(3)):
V[i, 2] *= -1
sig[2, 2] = -sig[2, 2]
return U, sig, V

@ti.func
def get_force_func(c, verts):
F = Ds(verts) @ F_B[c]
P = ti.Matrix.zero(ti.f32, 3, 3)
U, sig, V = ssvd(F)
P = 2 * mu * (F - U @ V.transpose())
H = -F_W[c] * P @ F_B[c].transpose()
for i in ti.static(range(3)):
force = ti.Vector([H[j, i] for j in range(3)])
F_f[verts[i]] += force
F_f[verts[3]] -= force

@ti.kernel
def get_force():
for c in F_vertices:
get_force_func(c, F_vertices[c])
for u in F_f:
F_f[u].y -= 9.8 * F_m[u]

@ti.kernel
def matmul_cell(ret: ti.template(), vel: ti.template()):
for i in ret:
ret[i] = vel[i] * F_m[i]
for c in F_vertices:
verts = F_vertices[c]
W_c = F_W[c]
B_c = F_B[c]
for u in range(4):
for d in range(3):
dD = ti.Matrix.zero(ti.f32, 3, 3)
if u == 3:
for j in range(3):
dD[d, j] = -1
else:
dD[d, u] = 1
dF = dD @ B_c
dP = 2.0 * mu * dF
dH = -W_c * dP @ B_c.transpose()
for i in range(3):
for j in range(3):
tmp = vel[verts[i]][j] - vel[verts[3]][j]
ret[verts[u]][d] += -(dt**2) * dH[j, i] * tmp

@ti.kernel
def add(ans: ti.template(), a: ti.template(), k: ti.f32, b: ti.template()):
for i in ans:
ans[i] = a[i] + k * b[i]

@ti.kernel
def dot(a: ti.template(), b: ti.template()) -> ti.f32:
ans = 0.0
for i in a:
ans += a[i].dot(b[i])
return ans

F_b = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
F_r0 = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
F_p0 = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)

# ndarray version of F_b
F_b_ndarr = ti.ndarray(dtype=ti.f32, shape=3 * n_verts)
# stiffness matrix
A_builder = ti.linalg.SparseMatrixBuilder(3 * n_verts, 3 * n_verts, max_num_triplets=50000)
solver = ti.linalg.SparseSolver(ti.f32, "LLT")

@ti.kernel
def get_b():
for i in F_b:
F_b[i] = F_m[i] * F_v[i] + dt * F_f[i]

def cg():
def mul(x):
matmul_cell(F_mul_ans, x)
return F_mul_ans

get_force()
get_b()
mul(F_v)
add(F_r0, F_b, -1, mul(F_v))

d = F_p0
d.copy_from(F_r0)
r_2 = dot(F_r0, F_r0)
n_iter = 50
epsilon = 1e-6
r_2_init = r_2
r_2_new = r_2
for _ in range(n_iter):
q = mul(d)
alpha = r_2_new / dot(d, q)
add(F_v, F_v, alpha, d)
add(F_r0, F_r0, -alpha, q)
r_2 = r_2_new
r_2_new = dot(F_r0, F_r0)
if r_2_new <= r_2_init * epsilon**2:
break
beta = r_2_new / r_2
add(d, F_r0, beta, d)
F_f.fill(0)
add(F_x, F_x, dt, F_v)

@ti.kernel
def compute_A(A: ti.types.sparse_matrix_builder()):
# A = M - dt * dt * K
for i in range(n_verts):
for j in range(3):
A[3 * i + j, 3 * i + j] += F_m[i]
for c in F_vertices:
verts = F_vertices[c]
W_c = F_W[c]
B_c = F_B[c]
for u in range(4):
for d in range(3):
dD = ti.Matrix.zero(ti.f32, 3, 3)
if u == 3:
for j in range(3):
dD[d, j] = -1
else:
dD[d, u] = 1
dF = dD @ B_c
dP = 2.0 * mu * dF
dH = -W_c * dP @ B_c.transpose()
for i in range(3):
for j in range(3):
A[3 * verts[u] + d, 3 * verts[i] + j] += -(dt**2) * dH[j, i]
for i in range(3):
A[3 * verts[u] + d, 3 * verts[3] + i] += -(dt**2) * (-dH[i, 0] - dH[i, 1] - dH[i, 2])

@ti.kernel
def flatten(dest: ti.types.ndarray(), src: ti.template()):
for i in range(n_verts):
for j in range(3):
dest[3 * i + j] = src[i][j]

@ti.kernel
def aggragate(dest: ti.template(), src: ti.types.ndarray()):
for i in range(n_verts):
for j in range(3):
dest[i][j] = src[3 * i + j]

def direct():
get_force()
get_b()
flatten(F_b_ndarr, F_b)
v = solver.solve(F_b_ndarr)
aggragate(F_v, v)
F_f.fill(0)
add(F_x, F_x, dt, F_v)

@ti.kernel
def advect():
for p in F_x:
F_v[p] += dt * (F_f[p] / F_m[p])
F_x[p] += dt * F_v[p]
F_f[p] = ti.Vector([0, 0, 0])

@ti.kernel
def init():
for u in F_x:
F_x[u] = F_ox[u]
F_v[u] = [0.0] * 3
F_f[u] = [0.0] * 3
F_m[u] = 0.0
for c in F_vertices:
F = Ds(F_vertices[c])
F_B[c] = F.inverse()
F_W[c] = ti.abs(F.determinant()) / 6
for i in range(4):
F_m[F_vertices[c][i]] += F_W[c] / 4 * density
# for u in F_x:
# F_x[u].y += 1.0

def init_A():
compute_A(A_builder)
A = A_builder.build()
solver.analyze_pattern(A)
solver.factorize(A)

@ti.kernel
def floor_bound():
for u in F_x:
if F_x[u].y < 0:
F_x[u].y = 0
if F_v[u].y < 0:
F_v[u].y = 0

@ti.func
def check(u):
ans = 0
rest = u
for i in ti.static(range(3)):
k = rest % n_cube[2 - i]
rest = rest // n_cube[2 - i]
if k == 0:
ans |= 1 << (i * 2)
if k == n_cube[2 - i] - 1:
ans |= 1 << (i * 2 + 1)
return ans

def gen_indices():
su = 0
for i in range(3):
su += (n_cube[i] - 1) * (n_cube[(i + 1) % 3] - 1)
return ti.field(ti.i32, shape=2 * su * 2 * 3)

indices = gen_indices()

@ti.kernel
def get_indices():
# calculate all the meshes on surface
cnt = 0
for c in F_vertices:
if c % 5 != 4:
for i in ti.static([0, 2, 3]):
verts = [F_vertices[c][(i + j) % 4] for j in range(3)]
sum_ = check(verts[0]) & check(verts[1]) & check(verts[2])
if sum_:
m = ti.atomic_add(cnt, 1)
det = ti.Matrix.rows([F_x[verts[i]] - [0.5, 1.5, 0.5] for i in range(3)]).determinant()
if det < 0:
tmp = verts[1]
verts[1] = verts[2]
verts[2] = tmp
indices[m * 3] = verts[0]
indices[m * 3 + 1] = verts[1]
indices[m * 3 + 2] = verts[2]

def substep():
cg()
floor_bound()

get_vertices()
init()
get_indices()
substep()
15 changes: 15 additions & 0 deletions tests/python/test_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -1375,3 +1375,18 @@ def test() -> ti.f32:
return length(v)

approx(test(), 5.477226)


@test_utils.test()
def test_matrix_loop_unique():
F_x = ti.Vector.field(3, dtype=ti.f32, shape=10)

@ti.kernel
def init():
for u in F_x:
F_x[u][1] += 1.0

init()

for u in range(10):
assert F_x[u][1] == 1.0

0 comments on commit 21fc9f9

Please sign in to comment.