Skip to content

Commit

Permalink
Optimize M2P
Browse files Browse the repository at this point in the history
  • Loading branch information
isuruf committed Jul 27, 2023
1 parent 19531d0 commit 6cd2ee6
Show file tree
Hide file tree
Showing 3 changed files with 224 additions and 13 deletions.
31 changes: 19 additions & 12 deletions sumpy/e2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,9 @@ class E2PFromSingleBox(E2PBase):
def default_name(self):
return "e2p_from_single_box"

def get_kernel(self, max_ntargets_in_one_box):
def get_kernel(self, max_ntargets_in_one_box, max_work_items):
ncoeffs = len(self.expansion)
loopy_args = self.get_loopy_args()
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))

loopy_knl = lp.make_kernel(
[
Expand Down Expand Up @@ -208,11 +207,16 @@ def get_kernel(self, max_ntargets_in_one_box):
return loopy_knl

def get_optimized_kernel(self, max_ntargets_in_one_box):
inner_knl, optimizations = self.get_loopy_evaluator_and_optimizations()
knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box)
inner_knl, optimizations = self.get_cached_loopy_knl_and_optimizations()

ncoeffs = len(self.expansion)
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))
knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box,
max_work_items=max_work_items)

knl = lp.tag_inames(knl, {"itgt_box": "g.0"})
knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0")
knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0")
knl = lp.split_iname(knl, "itgt_offset", max_work_items, inner_tag="l.0")
knl = lp.split_iname(knl, "icoeff", max_work_items, inner_tag="l.0")
knl = lp.add_inames_to_insn(knl, "dummy",
"id:fetch_init* or id:fetch_center or id:kernel_scaling")
knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling")
Expand Down Expand Up @@ -273,10 +277,9 @@ class E2PFromCSR(E2PBase):
def default_name(self):
return "e2p_from_csr"

def get_kernel(self, max_ntargets_in_one_box):
def get_kernel(self, max_ntargets_in_one_box, max_work_items):
ncoeffs = len(self.expansion)
loopy_args = self.get_loopy_args()
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))

loopy_knl = lp.make_kernel(
[
Expand Down Expand Up @@ -378,13 +381,17 @@ def get_kernel(self, max_ntargets_in_one_box):
return loopy_knl

def get_optimized_kernel(self, max_ntargets_in_one_box):
_, optimizations = self.get_loopy_evaluator_and_optimizations()
knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box)
_, optimizations = self.get_cached_loopy_knl_and_optimizations()
ncoeffs = len(self.expansion)
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))

knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box,
max_work_items=max_work_items)
knl = lp.tag_inames(knl, {"itgt_box": "g.0", "dummy": "l.0"})
knl = lp.unprivatize_temporaries_with_inames(knl,
"itgt_offset", "result_temp")
knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0")
knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0")
knl = lp.split_iname(knl, "itgt_offset", max_work_items, inner_tag="l.0")
knl = lp.split_iname(knl, "icoeff", max_work_items, inner_tag="l.0")
knl = lp.privatize_temporaries_with_inames(knl,
"itgt_offset_outer", "result_temp")
knl = lp.duplicate_inames(knl, "itgt_offset_outer", "id:init_result")
Expand Down
189 changes: 188 additions & 1 deletion sumpy/expansion/loopy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import loopy as lp
import numpy as np
from sumpy.expansion import ExpansionBase
from sumpy.kernel import Kernel
from sumpy.kernel import Kernel, LaplaceKernel
import sumpy.symbolic as sym
from sumpy.assignment_collection import SymbolicAssignmentCollection
from sumpy.tools import gather_loopy_arguments, gather_loopy_source_arguments
Expand Down Expand Up @@ -134,6 +134,193 @@ def make_e2p_loopy_kernel(expansion: ExpansionBase, kernels: Sequence[Kernel]) \
return loopy_knl, optimizations


def make_m2p_loopy_kernel_for_volume_taylor(
        expansion: ExpansionBase, kernels: Sequence[Kernel]) -> lp.TranslationUnit:
    """Build a specialized evaluation kernel for a 3D Laplace volume Taylor
    multipole expansion.

    Instead of emitting one symbolic expression per coefficient, the
    derivatives of ``1/r`` are generated inside the kernel by a recurrence
    and accumulated into ``result[0]`` one coefficient at a time.

    With ``u = 1/r`` and a multi-index ``n``, differentiating
    ``r^2 d_i u + b_i u = 0`` gives, for any axis ``i``::

        r^2 D^n u = -[(2 n_i - 1) b_i D^(n - e_i) u
                      + (n_i - 1)^2 D^(n - 2 e_i) u
                      + sum_{j != i} (2 n_j b_j D^(n - e_j) u
                                      + n_j (n_j - 1) D^(n - 2 e_j) u)]

    The first loop nest applies this along axis 2 for ``n_1 in {0, 1}``, the
    second along axis 1 for ``n_1 >= 2`` while keeping only two ``n_1``
    slices of ``temp`` alive (indexed by ``n_1 % 2``). Only ``n_0 in {0, 1}``
    is visited.

    :arg expansion: the expansion to evaluate; ``dim``, ``order``,
        ``get_coefficient_identifiers()`` and ``expansion_terms_wrangler``
        are read from it.
    :arg kernels: the kernels to evaluate; only a single
        :class:`~sumpy.kernel.LaplaceKernel` is supported.
    :returns: a tuple ``(loopy_knl, optimizations)`` where *optimizations* is
        an empty list.
    :raises NotImplementedError: if *kernels* is not exactly one
        :class:`~sumpy.kernel.LaplaceKernel`, if ``dim != 3`` or if
        ``order < 2``. Callers may use this as a signal to fall back to a
        generic code path.
    """
    # An empty sequence is rejected here too, so that callers relying on
    # NotImplementedError for their fallback do not see an IndexError.
    if len(kernels) != 1:
        raise NotImplementedError()

    kernel = kernels[0]
    if not isinstance(kernel, LaplaceKernel):
        raise NotImplementedError()

    dim = expansion.dim
    order = expansion.order

    if dim != 3 or order < 2:
        raise NotImplementedError()

    b = pymbolic.var("b")
    ncoeffs = len(expansion.get_coefficient_identifiers())

    temp = pymbolic.var("temp")
    inv_r = pymbolic.var("inv_r")
    inv_r2 = pymbolic.var("inv_r2")

    domains = [
        "{[idim]: 0<=idim<dim}",
    ]
    insns = [
        lp.Assignment(
            assignee="b[idim]",
            expression="(target[idim]-center[idim])/rscale",
            temp_var_type=lp.Optional(None),
        ),
        lp.Assignment(
            assignee=inv_r,
            expression="rsqrt(b[0]*b[0] + b[1]*b[1] + b[2]* b[2])",
            temp_var_type=lp.Optional(None),
        ),
        lp.Assignment(
            id="inv_r2",
            assignee=inv_r2,
            expression=inv_r*inv_r,
            temp_var_type=lp.Optional(None),
        ),
    ]

    # Closed-form seeds for every multi-index with all entries in {0, 1}.
    init_exprs = {}
    init_exprs[(0, 0, 0)] = inv_r

    # Order 1: d/db_i (1/r) = -b_i / r^3
    for i in range(dim):
        mi = [0]*dim
        mi[i] = 1
        init_exprs[tuple(mi)] = -inv_r * inv_r2 * b[i]

    # Order 2, mixed derivatives only: 3 b_i b_j / r^5 for i != j.
    # (i == j would collapse to a first-order multi-index and clobber the
    # order 1 seeds above, hence j starts at i + 1.)
    for i in range(dim):
        for j in range(i + 1, dim):
            mi = [0]*dim
            mi[i] = 1
            mi[j] = 1
            init_exprs[tuple(mi)] = inv_r*inv_r2*inv_r2*3*b[i]*b[j]

    # Order 3: -15 b_0 b_1 b_2 / r^7
    init_exprs[(1, 1, 1)] = -15 * inv_r * inv_r2 * inv_r2 * inv_r2 \
            * b[0] * b[1] * b[2]

    # Chain the seed assignments one after another via depends_on.
    depends_on = frozenset(["inv_r2"])
    for i in range(2):
        for j in range(2):
            for k in range(2):
                insn = lp.Assignment(
                    id=f"init_{i}_{j}_{k}",
                    assignee=temp[i, j, k],
                    expression=init_exprs[(i, j, k)],
                    depends_on=depends_on,
                )
                depends_on = frozenset([f"init_{i}_{j}_{k}"])
                insns.append(insn)

    wrangler = expansion.expansion_terms_wrangler
    result = pymbolic.var("result")
    coeffs = pymbolic.var("coeffs")

    # {{{ x1 in {0, 1}: recurrence along axis 2

    x2 = pymbolic.var("x2")
    x1 = pymbolic.var("x1")
    x0 = pymbolic.var("x0")
    # Total order x0 + x1 + x2 <= order, hence the inclusive bounds.
    domains += [
        f"{{[{x1}]: 0<={x1}<=1}}",
        f"{{[{x0}]: 0<={x0}<=1 and {x0}<=order-{x1} }}",
        f"{{[{x2}]: 0<={x2}<=order-{x0}-{x1} }}",
    ]
    # Since x0, x1 <= 1 the n_j (n_j - 1) terms vanish for axes 0 and 1.
    expr = (2*x2 - 1) * b[2] * temp[x0, x1, x2 - 1] + (x2 - 1)*(x2 - 1) \
            * temp[x0, x1, x2 - 2]
    expr += prim.If(prim.Comparison(x0, ">=", 1),
        2*x0*b[0]*temp[x0 - 1, x1, x2], 0)
    expr += prim.If(prim.Comparison(x1, ">=", 1),
        2*x1*b[1]*temp[x0, x1 - 1, x2], 0)
    expr *= -inv_r2
    insns += [
        lp.Assignment(
            id=f"temp_{x0}_{x1}_{x2}",
            assignee=temp[x0, x1, x2],
            expression=expr,
            depends_on=depends_on,
            # x2 in {0, 1} is already covered by the closed-form seeds
            predicates=frozenset([prim.Comparison(x2, ">=", 2)]),
        ),
        lp.Assignment(
            id=f"update_{x0}_{x1}_{x2}",
            assignee=result[0],
            expression=(result[0] + coeffs[wrangler.get_storage_index([x0, x1, x2])]
                * temp[x0, x1, x2]),
            depends_on=frozenset([f"temp_{x0}_{x1}_{x2}"])
        )
    ]
    depends_on = frozenset([f"update_{x0}_{x1}_{x2}"])

    # }}}

    # {{{ y1 >= 2: recurrence along axis 1, two y1 slices kept (y1 % 2)

    y2 = pymbolic.var("y2")
    y1 = pymbolic.var("y1")
    y0 = pymbolic.var("y0")
    domains += [
        f"{{[{y1}]: 2<={y1}<=order}}",
        f"{{[{y0}]: 0<={y0}<=1 and {y0}<=order-{y1} }}",
        f"{{[{y2}]: 0<={y2}<=order-{y0}-{y1} }}",
    ]
    expr = ((2*y1 - 1) * b[1] * temp[y0, (y1 - 1) % 2, y2] + (y1 - 1)*(y1 - 1)
        * temp[y0, (y1 - 2) % 2, y2])
    expr += prim.If(prim.Comparison(y0, ">=", 1),
        2*y0*b[0]*temp[y0 - 1, y1 % 2, y2], 0)
    expr += prim.If(prim.Comparison(y2, ">=", 1),
        2*y2*b[2]*temp[y0, y1 % 2, y2 - 1], 0)
    # n_j (n_j - 1) D^(n - 2 e_j) term for axis 2
    expr += prim.If(prim.Comparison(y2, ">=", 2),
        y2*(y2 - 1)*temp[y0, y1 % 2, y2 - 2], 0)
    expr *= -inv_r2
    insns += [
        lp.Assignment(
            id=f"temp_{y0}_{y1}_{y2}",
            assignee=temp[y0, y1 % 2, y2],
            expression=expr,
            depends_on=depends_on,
        ),
        lp.Assignment(
            id=f"update_{y0}_{y1}_{y2}",
            assignee=result[0],
            expression=(result[0] + coeffs[wrangler.get_storage_index([y0, y1, y2])]
                * temp[y0, y1 % 2, y2]),
            depends_on=frozenset([f"temp_{y0}_{y1}_{y2}"])
        )
    ]
    depends_on = frozenset([f"update_{y0}_{y1}_{y2}"])

    # }}}

    loopy_knl = lp.make_function(domains, insns,
        kernel_data=[
            lp.GlobalArg("result", shape=(len(kernels),), is_input=True,
                is_output=True),
            lp.GlobalArg("coeffs",
                shape=(ncoeffs,), is_input=True, is_output=False),
            lp.GlobalArg("center, target",
                shape=(dim,), is_input=True, is_output=False),
            lp.ValueArg("rscale", is_input=True),
            lp.ValueArg("itgt", is_input=True),
            lp.ValueArg("ntargets", is_input=True),
            lp.GlobalArg("targets",
                shape=(dim, "ntargets"), is_input=True, is_output=False),
            lp.TemporaryVariable("temp"),
            ...],
        name="e2p",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        fixed_parameters={"dim": dim, "order": order},
        )

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.tag_inames(loopy_knl, "x0*:unr")
    loopy_knl = lp.tag_inames(loopy_knl, "x1*:unr")
    loopy_knl = lp.tag_inames(loopy_knl, "y0*:unr")
    for knl in kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    loopy_knl = lp.simplify_indices(loopy_knl)

    optimizations = []

    return loopy_knl, optimizations


def make_p2e_loopy_kernel(
expansion: ExpansionBase, kernels: Sequence[Kernel],
strength_usage: Sequence[int], nstrengths: int) -> lp.TranslationUnit:
Expand Down
17 changes: 17 additions & 0 deletions sumpy/expansion/multipole.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
VolumeTaylorExpansionMixin,
LinearPDEConformingVolumeTaylorExpansion)
from sumpy.tools import mi_set_axis, add_to_sac, mi_power, mi_factorial
from sumpy.kernel import Kernel

import loopy as lp

from typing import Sequence

import logging
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -103,6 +108,18 @@ def evaluate(self, kernel, coeffs, bvec, rscale, sac=None):
result = sym.Add(*tuple(result))
return result

def get_loopy_evaluator(self, kernels: Sequence[Kernel]) -> lp.TranslationUnit:
    """
    Prefer the specialized volume Taylor M2P kernel builder and fall back
    to the generic E2P kernel builder when the specialized one raises
    :exc:`NotImplementedError` for this expansion/kernel combination.

    :arg kernels: the kernels whose potential is to be evaluated.
    :returns: whatever the selected :mod:`loopy` kernel builder returns
        for *self* and *kernels*.
    """
    from sumpy.expansion.loopy import (
        make_e2p_loopy_kernel as build_generic,
        make_m2p_loopy_kernel_for_volume_taylor as build_specialized)

    try:
        evaluator = build_specialized(self, kernels)
    except NotImplementedError:
        # the specialized builder does not cover this case
        evaluator = build_generic(self, kernels)

    return evaluator

def translate_from(self, src_expansion, src_coeff_exprs, src_rscale,
dvec, tgt_rscale, sac=None, _fast_version=True):
if not isinstance(src_expansion, type(self)):
Expand Down

0 comments on commit 6cd2ee6

Please sign in to comment.