mruberry
diff --git a/‎.clang-format
+88 b/‎.clang-format
+88
diff --git a/‎benchmarks/basic.py
+24 b/‎benchmarks/basic.py
+24
diff --git a/‎benchmarks/jit_apply.py
+73 b/‎benchmarks/jit_apply.py
+73
diff --git a/‎benchmarks/nearest_neighbors.py
+165 b/‎benchmarks/nearest_neighbors.py
+165
@@ -0,0 +1,88 @@
+---
+# clang-format style configuration for the C++ sources of this project.
+# Option names and values follow the clang-format "Style Options" reference.
+AccessModifierOffset: -1
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+# NOTE(review): newer clang-format releases spell this option
+# AlignEscapedNewlines -- confirm against the clang-format version in use.
+AlignEscapedNewlinesLeft: true
+AlignOperands:   false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+# Arguments/parameters are not bin-packed: they either all fit on one line
+# or each gets its own line.
+BinPackArguments: false
+BinPackParameters: false
+# Every brace-wrapping switch is off, matching BreakBeforeBraces: Attach
+# below (opening braces stay on the line of the construct they open).
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+# Lines are wrapped at 80 columns.
+ColumnLimit:     80
+# Comments matching this regex (IWYU pragmas) are left untouched.
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+# Macros that should be formatted like foreach loops.
+ForEachMacros:   [ FOR_EACH_RANGE, FOR_EACH, ]
+# Include grouping order: <*.h>/<*.hpp> first, then any other <...> include,
+# then everything else.
+IncludeCategories:
+  - Regex:           '^<.*\.h(pp)?>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IndentCaseLabels: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+# Penalties steer the line-breaking cost model; the very large values for
+# PenaltyExcessCharacter and PenaltyReturnTypeOnItsOwnLine make exceeding
+# the column limit or isolating a return type a last resort.
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 2000000
+# Pointer/reference markers bind to the type: `int* p`.
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+# Indentation uses spaces only; TabWidth applies to tab characters that are
+# already present in the source.
+TabWidth:        8
+UseTab:          Never
+...
@@ -0,0 +1,24 @@
+from nestedtensor import torch
+import utils
+
+import random
+
+
+def gen_list_nested_tensor_construction():
+    """Build a benchmark closure that constructs a ``_ListNestedTensor``.
+
+    Twenty input tensors (random row count in [500, 1500], 25600 columns)
+    are created once here, so the returned callable only performs the
+    nested-tensor construction.
+
+    Returns:
+        A zero-argument callable suitable for ``utils.benchmark_fn``.
+    """
+    tensors = []
+    for _ in range(20):
+        num_rows = random.randint(500, 1500)
+        tensors.append(torch.rand(num_rows, 25600))
+
+    def _algorithm():
+        # The constructed nested tensor is intentionally discarded.
+        torch._ListNestedTensor(tensors)
+
+    return _algorithm
+
+def gen_list_nested_tensor_unbind():
+    """Build a benchmark closure that calls ``unbind()`` on a nested tensor.
+
+    The ``_ListNestedTensor`` (20 tensors, random row count in [500, 1500],
+    25600 columns) is created once up front; the returned callable only
+    performs the ``unbind()`` call.
+
+    Returns:
+        A zero-argument callable suitable for ``utils.benchmark_fn``.
+    """
+    components = []
+    for _ in range(20):
+        num_rows = random.randint(500, 1500)
+        components.append(torch.rand(num_rows, 25600))
+    nested_tensor = torch._ListNestedTensor(components)
+
+    def _algorithm():
+        # The unbound tensors are intentionally discarded.
+        nested_tensor.unbind()
+
+    return _algorithm
+
+if __name__ == "__main__":
+    # Script entry point: run the unbind benchmark and print whatever
+    # utils.benchmark_fn returns for it.
+    # The commented-out lines are alternative runs (cProfile mode and the
+    # construction benchmark) that can be re-enabled by hand.
+    # print(utils.benchmark_fn(alg, use_cprofile=True))
+    # alg = gen_list_nested_tensor_construction()
+    # print(utils.benchmark_fn(alg))
+    alg = gen_list_nested_tensor_unbind()
+    print(utils.benchmark_fn(alg))
@@ -0,0 +1,73 @@
+from nestedtensor import torch
+import nestedtensor
+import utils
+
+
+def vmap(fn):
+    """Wrap ``fn`` so that it can also be applied to non-tensor arguments.
+
+    The returned callable applies ``fn`` directly when given a plain tensor.
+    For any other argument it scripts a thin wrapper around ``fn`` with
+    ``torch.jit.script`` and passes it to the argument's ``jit_apply``
+    method.
+    """
+    def decorator(arg):
+        if not torch.is_tensor(arg):
+            # Script a fresh wrapper so jit_apply receives a TorchScript
+            # function rather than the raw Python callable.
+            def asd(x):
+                return fn(x)
+            scripted = torch.jit.script(asd)
+            return arg.jit_apply(scripted)
+        return fn(arg)
+    return decorator
+
+
+# TorchScript-compiled helper: returns the absolute value of ``x + 1``.
+@torch.jit.script
+def my_fun(x):
+    return (x + 1).abs()
+
+# print(e)
+
+
+def gen_current():
+    """Build a benchmark closure for eager ``(n + 1).abs()`` on a nested tensor.
+
+    The nested tensor is created once from 128 random CUDA tensors of shape
+    (256, 128); the returned callable only performs the add and abs.
+
+    Returns:
+        A zero-argument callable suitable for ``utils.benchmark_fn``.
+    """
+    tensors = []
+    for _ in range(128):
+        tensors.append(torch.randn(256, 128).to(device='cuda'))
+    n = torch.as_nested_tensor(tensors)
+
+    def _algorithm():
+        # The result is intentionally discarded.
+        (n + 1).abs()
+
+    return _algorithm
+
+
+def gen_jit():
+    """Build a benchmark closure that applies a scripted function via
+    ``nestedtensor._C.jit_apply_function``.
+
+    A ``_ListNestedTensor`` of 128 random CUDA tensors of shape (256, 128)
+    is created once; the returned callable passes it twice, as ``(n, n)``,
+    together with a TorchScript function to ``jit_apply_function``.
+
+    Returns:
+        A zero-argument callable suitable for ``utils.benchmark_fn``.
+    """
+
+    n = nestedtensor._ListNestedTensor(
+        [torch.randn(256, 128).to(device='cuda') for _ in range(128)])
+
+    # Factory that binds a Python scalar and a tensor into a scripted
+    # function. The captured values are exposed through @torch.jit.ignore
+    # accessors, which stay ordinary Python functions that the scripted
+    # code calls back into.
+    def gen_my_fun(scalar, tensor):
+        @torch.jit.ignore
+        def get_scalar() -> float:
+            return scalar
+
+        @torch.jit.ignore
+        def get_tensor() -> torch.Tensor:
+            return tensor
+
+        # Computes y + |x + scalar + tensor|.
+        @torch.jit.script
+        def my_fun(x, y):
+            x = x + get_scalar()
+            x = x + get_tensor()
+            y = y + x.abs()
+            return y
+        return my_fun
+    # Local name my_fun shadows the module-level my_fun inside this function.
+    my_fun = gen_my_fun(3.0, torch.randn(1).to(device='cuda'))
+
+    def _algorithm():
+        nestedtensor._C.jit_apply_function((n, n), my_fun)
+
+    return _algorithm
+
+
+if __name__ == "__main__":
+    # Script entry point: benchmark the eager path (gen_current) and the
+    # jit_apply_function path (gen_jit) and print what utils.benchmark_fn
+    # returns for each.
+    # NOTE(review): the commented-out lines below refer to
+    # gen_list_nested_tensor_construction, which is not defined in this
+    # file -- they look like leftovers copied from benchmarks/basic.py.
+    # print(utils.benchmark_fn(alg, use_cprofile=True))
+    # alg = gen_list_nested_tensor_construction()
+    # print(utils.benchmark_fn(alg))
+    alg1 = gen_current()
+    print(utils.benchmark_fn(alg1))
+    alg2 = gen_jit()
+    print(utils.benchmark_fn(alg2))
@@ -0,0 +1,165 @@
+from nestedtensor import torch
+import nestedtensor
+import argparse
+import time
+import random
+import pprint
+
+EMBED_DIM = 1024
+
+SEED = 0
+
+
+def gen_tensor():
+    """Return a random CUDA vector of length ``EMBED_DIM``.
+
+    Each call also increments the module-level ``SEED`` counter. The counter
+    is not fed into the random generator here; it is only used by the
+    commented-out variant below.
+    """
+    global SEED
+    SEED += 1
+    # return torch.tensor([globals()['SEED']])
+    vector = torch.rand(EMBED_DIM)
+    return vector.to(device='cuda')
+
+
+def gen_clusters(num_clusters, size_range):
+    """Generate ``num_clusters`` clusters of random vectors.
+
+    Args:
+        num_clusters: Number of clusters to create.
+        size_range: ``(low, high)`` pair forwarded to ``random.randint``;
+            each cluster gets a size drawn from that inclusive range.
+
+    Returns:
+        A list of clusters, each a list of tensors from ``gen_tensor()``.
+    """
+    clusters = []
+    for _ in range(num_clusters):
+        num_entries = random.randint(*size_range)
+        clusters.append([gen_tensor() for _ in range(num_entries)])
+    return clusters
+
+
+def gen_algorithm_naive(keys, sub_clusters):
+    """Build the baseline scorer: one ``torch.dot`` per (key, entry) pair.
+
+    Args:
+        keys: Sequence of key vectors, paired positionally with
+            ``sub_clusters`` (pairing stops at the shorter sequence).
+        sub_clusters: Sequence of sub-clusters; each is a list of clusters,
+            and each cluster is a list of entry vectors.
+
+    Returns:
+        A zero-argument callable returning, per sub-cluster, a list with one
+        list of Python float scores per cluster.
+    """
+    def _naive():
+        # For-loops over vectors
+        return [
+            [
+                [torch.dot(key, entry).item() for entry in cluster]
+                for cluster in sub_cluster
+            ]
+            for sub_cluster, key in zip(sub_clusters, keys)
+        ]
+    return _naive
+
+def gen_algorithm_mv(keys, sub_clusters):
+    """Build a scorer that uses one ``torch.mv`` per cluster.
+
+    Every cluster (a list of entry vectors) is stacked into a matrix once,
+    up front, so the returned callable only performs the matrix-vector
+    products.
+
+    Args:
+        keys: Sequence of key vectors, paired positionally with
+            ``sub_clusters`` (pairing stops at the shorter sequence).
+        sub_clusters: Sequence of sub-clusters; each is a list of clusters,
+            and each cluster is a list of entry vectors.
+
+    Returns:
+        A zero-argument callable returning, per sub-cluster, a list with one
+        score tensor per cluster.
+    """
+    # For-loops over vectors and matrices
+    stacked = [
+        [torch.stack(cluster) for cluster in sub_cluster]
+        for sub_cluster in sub_clusters
+    ]
+
+    def _mv():
+        return [
+            [torch.mv(matrix, key) for matrix in group]
+            for group, key in zip(stacked, keys)
+        ]
+    return _mv
+
+def gen_algorithm_nested_mv(keys, sub_clusters):
+    """Build a scorer that issues a single ``torch.mv`` on nested tensors.
+
+    The nested tensors are built once, directly from the raw inputs, so the
+    returned callable only performs the ``torch.mv`` call. No per-cluster
+    pre-stacking is done here.
+
+    Args:
+        keys: Sequence of key vectors.
+        sub_clusters: Sequence of sub-clusters; each is a list of clusters,
+            and each cluster is a list of entry vectors.
+
+    Returns:
+        A zero-argument callable returning the result of ``torch.mv`` on the
+        nested sub-clusters and nested keys.
+    """
+    # NOTE(review): to_tensor(2) presumably collapses the nesting from
+    # level 2 downwards into dense tensors -- confirm against the
+    # nestedtensor API.
+    nested_sub_clusters = torch.nested_tensor(sub_clusters).to_tensor(2)
+    nested_keys = torch.nested_tensor(keys)
+
+    def _nested_mv():
+        return torch.mv(nested_sub_clusters, nested_keys)
+    return _nested_mv
+
+def gen_algorithm_nested_jit_mv(keys, sub_clusters):
+    """Build a scorer that applies a scripted ``torch.mv`` via jit_apply_function.
+
+    Each cluster is stacked into a matrix, the stacked sub-clusters and the
+    keys are wrapped in ``_ListNestedTensor`` objects, and their nested
+    sizes are printed as a diagnostic. The returned callable only performs
+    the ``nestedtensor._C.jit_apply_function`` call.
+
+    Args:
+        keys: Sequence of key vectors.
+        sub_clusters: Sequence of sub-clusters; each is a list of clusters,
+            and each cluster is a list of entry vectors.
+
+    Returns:
+        A zero-argument callable returning the ``jit_apply_function`` result.
+    """
+    # For-loops over vectors and matrices
+    stacked = [
+        [torch.stack(cluster) for cluster in sub_cluster]
+        for sub_cluster in sub_clusters
+    ]
+    nested_sub_clusters = nestedtensor._ListNestedTensor(stacked)
+    print("HERE")
+    print(nested_sub_clusters.nested_size())
+    nested_keys = nestedtensor._ListNestedTensor(keys)
+    print(nested_keys.nested_size())
+
+    @torch.jit.script
+    def my_fun(x, y):
+        return torch.mv(x, y)
+
+    def _nested_jit_mv():
+        return nestedtensor._C.jit_apply_function((nested_sub_clusters, nested_keys), my_fun)
+    return _nested_jit_mv
+
+
+def print_results(results, keys, sub_clusters, print_details=False):
+    """Pretty-print benchmark results, optionally preceded by the inputs.
+
+    Args:
+        results: Iterable of per-sub-cluster results; item ``i`` is printed
+            under a "sub cluster i and key i" heading.
+        keys: Key vectors; printed only when ``print_details`` is true.
+        sub_clusters: Sub-clusters; printed (with cluster and entry counts)
+            only when ``print_details`` is true.
+        print_details: When true, dump the inputs before the results.
+    """
+    if print_details:
+        header = "\n\u001b[31msub cluster {} count {} total number of entries {}\u001b[0m"
+        for index, sub_cluster in enumerate(sub_clusters):
+            total_entries = sum(len(cluster) for cluster in sub_cluster)
+            print(header.format(index, len(sub_cluster), total_entries))
+            pprint.pprint(sub_cluster)
+        print("\nkeys")
+        pprint.pprint(keys)
+        print("")
+
+    title = "result scores for \u001b[31msub cluster {} and key {}\u001b[0m"
+    for index, result in enumerate(results):
+        print(title.format(index, index))
+        pprint.pprint(result)
+
+def benchmark_fn(fn, run_time=15.0):
+    """Time repeated calls of ``fn`` and summarize them in microseconds.
+
+    ``fn`` is called once untimed as a warm-up, then called repeatedly until
+    the sum of the measured durations reaches ``run_time`` seconds. Each
+    measurement includes a ``torch.cuda.synchronize()`` so that
+    asynchronously launched CUDA work is counted.
+
+    Args:
+        fn: Zero-argument callable to benchmark; its ``__name__`` is used in
+            the report.
+        run_time: Measured-time budget in seconds.
+
+    Returns:
+        A formatted string with the mean and standard deviation of the
+        per-call time in microseconds and the number of timed runs.
+    """
+    # Warm-up call; drain the CUDA queue afterwards so leftover warm-up work
+    # is not attributed to the first timed iteration.
+    fn()
+    torch.cuda.synchronize()
+
+    times = []
+    elapsed = 0.0
+    while elapsed < run_time:
+        # perf_counter is a monotonic, high-resolution interval clock.
+        start = time.perf_counter()
+        fn()
+        torch.cuda.synchronize()
+        duration = time.perf_counter() - start
+        elapsed += duration
+        times.append(duration)
+    times = torch.tensor(times) * 1e6  # seconds -> microseconds
+    return "fn {:<15} avg(us): {:10.4f} std(us): {:10.4f} num_runs: {}".format(
+        fn.__name__, times.mean().item(), times.std().item(), len(times))
+
+
+if __name__ == "__main__":
+    # Script entry point: build keys and clusters, create the four scoring
+    # closures, benchmark the nested-JIT variant, and optionally print the
+    # scores of the other three variants.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--print-results', dest='print_results', action='store_true')
+    args = parser.parse_args()
+    # NOTE: This dodges creating these subclusters from a single set of clusters.
+    # That additional memory pressure might be crucial.
+    # NOTE(review): list multiplication repeats the SAME tensor object, so
+    # all 16 keys are identical.
+    keys = [gen_tensor()] * 16
+    clusters = gen_clusters(16, (16,16))
+    # NOTE(review): this assignment is overwritten below before it is used.
+    sub_clusters = [[clusters[random.randint(0, 15)]] * 8 for _ in range(16)]
+
+    # Two keys for now
+    # Simulating some overlap
+    # NOTE(review): only two sub-clusters are kept while keys still has 16
+    # entries; the zip-based algorithms therefore use just the first two
+    # keys. clusters[2] appears in both slices (the "overlap").
+
+    sub_clusters = [clusters[:3], clusters[2:]]
+
+    # Get algorithm
+    gen_results_naive = gen_algorithm_naive(keys, sub_clusters)
+    gen_results_mv = gen_algorithm_mv(keys, sub_clusters)
+    gen_results_nested_mv = gen_algorithm_nested_mv(keys, sub_clusters)
+    gen_results_nested_jit_mv = gen_algorithm_nested_jit_mv(keys, sub_clusters)
+
+    # Only the nested-JIT variant is benchmarked; the other runs and the
+    # cProfile scaffold below are disabled and can be re-enabled by hand.
+    # print(benchmark_fn(gen_results_naive))
+    # print(benchmark_fn(gen_results_mv))
+    # print(benchmark_fn(gen_results_nested_mv))
+    print(benchmark_fn(gen_results_nested_jit_mv))
+    # import cProfile, pstats, io
+    # pr = cProfile.Profile()
+    # pr.enable()
+    # pr.disable()
+    # s = io.StringIO()
+    # sortby = 'tottime'
+    # ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
+    # ps.print_stats()
+    # print(s.getvalue())
+    # print(benchmark_fn(gen_results_nested_mv))
+
+    # NOTE(review): the nested_jit_mv results are not printed here.
+    if args.print_results:
+        print('naive')
+        print_results(gen_results_naive(), keys, sub_clusters)
+        print('\nmv')
+        print_results(gen_results_mv(), keys, sub_clusters)
+        print('\nnested_mv')
+        print_results(gen_results_nested_mv(), keys, sub_clusters)