Feature/add mlx #73

Merged: 14 commits, Oct 29, 2024
Changes from all commits
21 changes: 14 additions & 7 deletions Package.swift
@@ -27,8 +27,10 @@ let package = Package(
.library(name: "SpeziLLMFog", targets: ["SpeziLLMFog"])
],
dependencies: [
.package(url: "https://github.com/ml-explore/mlx-swift", from: "0.18.1"),
.package(url: "https://github.com/ml-explore/mlx-swift-examples", from: "1.16.0"),
.package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.12")),
.package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.9")),
.package(url: "https://github.com/StanfordBDHG/llama.cpp", .upToNextMinor(from: "0.3.3")),
.package(url: "https://github.com/StanfordSpezi/Spezi", from: "1.2.1"),
.package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "2.0.0-beta.3"),
.package(url: "https://github.com/StanfordSpezi/SpeziStorage", from: "1.0.2"),
@@ -49,19 +51,24 @@
name: "SpeziLLMLocal",
dependencies: [
.target(name: "SpeziLLM"),
.product(name: "llama", package: "llama.cpp"),
.product(name: "SpeziFoundation", package: "SpeziFoundation"),
.product(name: "Spezi", package: "Spezi")
],
swiftSettings: [
.interoperabilityMode(.Cxx)
.product(name: "Spezi", package: "Spezi"),
.product(name: "MLX", package: "mlx-swift"),
.product(name: "MLXFast", package: "mlx-swift"),
.product(name: "MLXNN", package: "mlx-swift"),
.product(name: "MLXOptimizers", package: "mlx-swift"),
.product(name: "MLXRandom", package: "mlx-swift"),
.product(name: "Transformers", package: "swift-transformers"),
.product(name: "LLM", package: "mlx-swift-examples")
]
),
.target(
name: "SpeziLLMLocalDownload",
dependencies: [
.product(name: "SpeziOnboarding", package: "SpeziOnboarding"),
.product(name: "SpeziViews", package: "SpeziViews")
.product(name: "SpeziViews", package: "SpeziViews"),
.target(name: "SpeziLLMLocal"),
.product(name: "LLM", package: "mlx-swift-examples")
]
),
.target(
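For reference, a minimal sketch of what a consuming SPM package could look like after this change; the package name, target name, version, and platform requirements below are illustrative. Once `SpeziLLMLocal` is backed by MLX instead of llama.cpp, the consumer no longer has to set the C++ interoperability build setting.

```swift
// swift-tools-version:5.9
import PackageDescription

let package = Package(
    name: "ExampleConsumingPackage",
    // Platform versions are illustrative; align them with the SpeziLLM release you depend on.
    platforms: [.iOS(.v17), .macOS(.v14)],
    dependencies: [
        // Version is illustrative; pin to the SpeziLLM release that ships the MLX-based local target.
        .package(url: "https://github.com/StanfordSpezi/SpeziLLM", .upToNextMinor(from: "0.9.0"))
    ],
    targets: [
        .target(
            name: "ExampleConsumingTarget",
            dependencies: [
                .product(name: "SpeziLLMLocal", package: "SpeziLLM")
            ]
            // Note: the `.interoperabilityMode(.Cxx)` swift setting previously required
            // for llama.cpp is no longer needed once the target is backed by MLX.
        )
    ]
)
```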
37 changes: 7 additions & 30 deletions README.md
@@ -57,37 +57,13 @@ The section below highlights the setup and basic use of the [SpeziLLMLocal](http

### Spezi LLM Local

The target enables developers to easily execute medium-size Language Models (LLMs) locally on-device via the [llama.cpp framework](https://github.com/ggerganov/llama.cpp). The module allows you to interact with the locally run LLM via purely Swift-based APIs, no interaction with low-level C or C++ code is necessary, building on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm).
The target enables developers to easily execute medium-size Language Models (LLMs) locally on-device. The module allows you to interact with the locally run LLM via purely Swift-based APIs; no interaction with low-level code is necessary, as it builds on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm).

> [!IMPORTANT]
> Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal GPU family (`MTLGPUFamily`), which the simulator does not provide.

> [!IMPORTANT]
> Important: In order to use the LLM local target, one needs to set build parameters in the consuming Xcode project or the consuming SPM package to enable the [Swift / C++ Interop](https://www.swift.org/documentation/cxx-interop/), introduced in Xcode 15 and Swift 5.9. Keep in mind that this also applies to nested dependencies: one needs to set this configuration recursively for the entire dependency tree towards the llama.cpp SPM package. <!-- markdown-link-check-disable-line -->
>
> **For Xcode projects:**
> - Open your [build settings in Xcode](https://developer.apple.com/documentation/xcode/configuring-the-build-settings-of-a-target/) by selecting *PROJECT_NAME > TARGET_NAME > Build Settings*.
> - Within the *Build Settings*, search for the `C++ and Objective-C Interoperability` setting and set it to `C++ / Objective-C++`. This enables the project to use the C++ headers from llama.cpp.
>
> **For SPM packages:**
> - Open the `Package.swift` file of your [SPM package](https://www.swift.org/documentation/package-manager/) <!-- markdown-link-check-disable-line -->
> - Within the package `target` that consumes the llama.cpp package, add the `interoperabilityMode(_:)` Swift build setting as follows:
> ```swift
> /// Adds the dependency to the Spezi LLM SPM package
> dependencies: [
> .package(url: "https://github.com/StanfordSpezi/SpeziLLM", .upToNextMinor(from: "0.6.0"))
> ],
> targets: [
> .target(
> name: "ExampleConsumingTarget",
> /// State the dependence of the target to SpeziLLMLocal
> dependencies: [
> .product(name: "SpeziLLMLocal", package: "SpeziLLM")
> ],
> /// Important: Configure the `.interoperabilityMode(_:)` within the `swiftSettings`
> swiftSettings: [
> .interoperabilityMode(.Cxx)
> ]
> )
> ]
> ```
> Important: When using the LLM local target, some LLMs require the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to be added to the project.

#### Setup

@@ -123,7 +99,8 @@ struct LLMLocalDemoView: View {
// Instantiate the `LLMLocalSchema` to an `LLMLocalSession` via the `LLMRunner`.
let llmSession: LLMLocalSession = runner(
with: LLMLocalSchema(
modelPath: URL(string: "URL to the local model file")!
model: .llama3_8B_4bit,
formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3
)
)

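For context, a minimal sketch of how such a demo view could stream the generated tokens, assuming the callable `LLMRunner` environment object and the token-streaming `generate()` API of SpeziLLM; the state handling and error reporting here are illustrative, not part of this diff.

```swift
import SpeziLLM
import SpeziLLMLocal
import SwiftUI

struct LLMLocalDemoView: View {
    @Environment(LLMRunner.self) var runner
    @State var responseText = ""

    var body: some View {
        Text(responseText)
            .task {
                // Instantiate the `LLMLocalSchema` to an `LLMLocalSession` via the `LLMRunner`.
                let llmSession: LLMLocalSession = runner(
                    with: LLMLocalSchema(
                        model: .llama3_8B_4bit,
                        formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3
                    )
                )

                do {
                    // Stream the generated tokens into the view state.
                    for try await token in try await llmSession.generate() {
                        responseText.append(token)
                    }
                } catch {
                    // Illustrative only: surface errors as needed in a real app.
                    responseText = "Generation failed: \(error)"
                }
            }
    }
}
```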
199 changes: 4 additions & 195 deletions Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift
@@ -1,218 +1,27 @@
//
// This source file is part of the Stanford Spezi open source project
//
// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md)
// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md)
//
// SPDX-License-Identifier: MIT
//

import Foundation
@preconcurrency import llama


/// Represents the context parameters of the LLM.
///
/// Internally, these data points are passed as a llama.cpp `llama_context_params` C struct to the LLM.
public struct LLMLocalContextParameters: Sendable {
// swiftlint:disable identifier_name
/// Swift representation of the `ggml_type` of llama.cpp, indicating data types within KV caches.
public enum GGMLType: UInt32 {
case f32 = 0
case f16
case q4_0
case q4_1
case q5_0 = 6
case q5_1
case q8_0
case q8_1
/// k-quantizations
case q2_k
case q3_k
case q4_k
case q5_k
case q6_k
case q8_k
case iq2_xxs
case iq2_xs
case i8
case i16
case i32
}
// swiftlint:enable identifier_name


/// Wrapped C struct from the llama.cpp library, later-on passed to the LLM
private var wrapped: llama_context_params


/// Context parameters in llama.cpp's low-level C representation
var llamaCppRepresentation: llama_context_params {
wrapped
}

/// RNG seed of the LLM
var seed: UInt32 {
get {
wrapped.seed
}
set {
wrapped.seed = newValue
}
}

/// Context window size in tokens (0 = take default window size from model)
var contextWindowSize: UInt32 {
get {
wrapped.n_ctx
}
set {
wrapped.n_ctx = newValue
}
}

/// Maximum batch size during prompt processing
var batchSize: UInt32 {
get {
wrapped.n_batch
}
set {
wrapped.n_batch = newValue
}
}

/// Number of threads used by LLM for generation of output
var threadCount: UInt32 {
get {
wrapped.n_threads
}
set {
wrapped.n_threads = newValue
}
}

/// Number of threads used by LLM for batch processing
var threadCountBatch: UInt32 {
get {
wrapped.n_threads_batch
}
set {
wrapped.n_threads_batch = newValue
}
}

/// RoPE base frequency (0 = take default from model)
var ropeFreqBase: Float {
get {
wrapped.rope_freq_base
}
set {
wrapped.rope_freq_base = newValue
}
}

/// RoPE frequency scaling factor (0 = take default from model)
var ropeFreqScale: Float {
get {
wrapped.rope_freq_scale
}
set {
wrapped.rope_freq_scale = newValue
}
}

/// If `true`, offload the KQV ops (including the KV cache) to GPU
var offloadKQV: Bool {
get {
wrapped.offload_kqv
}
set {
wrapped.offload_kqv = newValue
}
}

/// ``GGMLType`` of the key of the KV cache
var kvKeyType: GGMLType {
get {
GGMLType(rawValue: wrapped.type_k.rawValue) ?? .f16
}
set {
wrapped.type_k = ggml_type(rawValue: newValue.rawValue)
}
}

/// ``GGMLType`` of the value of the KV cache
var kvValueType: GGMLType {
get {
GGMLType(rawValue: wrapped.type_v.rawValue) ?? .f16
}
set {
wrapped.type_v = ggml_type(rawValue: newValue.rawValue)
}
}

/// If `true`, the (deprecated) `llama_eval()` call computes all logits, not just the last one
var computeAllLogits: Bool {
get {
wrapped.logits_all
}
set {
wrapped.logits_all = newValue
}
}

/// If `true`, the mode is set to embeddings only
var embeddingsOnly: Bool {
get {
wrapped.embeddings
}
set {
wrapped.embeddings = newValue
}
}
var seed: UInt64?

/// Creates the ``LLMLocalContextParameters`` which wrap the underlying llama.cpp `llama_context_params` C struct.
/// Is passed to the underlying llama.cpp model in order to configure the context of the LLM.
///
/// - Parameters:
/// - seed: RNG seed of the LLM, defaults to `4294967295` (which represents a random seed).
/// - contextWindowSize: Context window size in tokens, defaults to `1024`.
/// - batchSize: Maximum batch size during prompt processing, defaults to `1024` tokens.
/// - threadCount: Number of threads used by LLM for generation of output, defaults to the processor count of the device.
/// - threadCountBatch: Number of threads used by LLM for batch processing, defaults to the processor count of the device.
/// - ropeFreqBase: RoPE base frequency, defaults to `0` indicating the default from model.
/// - ropeFreqScale: RoPE frequency scaling factor, defaults to `0` indicating the default from model.
/// - offloadKQV: Offloads the KQV ops (including the KV cache) to GPU, defaults to `true`.
/// - kvKeyType: ``GGMLType`` of the key of the KV cache, defaults to ``GGMLType/f16``.
/// - kvValueType: ``GGMLType`` of the value of the KV cache, defaults to ``GGMLType/f16``.
/// - computeAllLogits: `llama_eval()` call computes all logits, not just the last one. Defaults to `false`.
/// - embeddingsOnly: Embedding-only mode, defaults to `false`.
/// - seed: RNG seed of the LLM, defaults to a random seed.
public init(
seed: UInt32 = 4294967295,
contextWindowSize: UInt32 = 1024,
batchSize: UInt32 = 1024,
threadCount: UInt32 = .init(ProcessInfo.processInfo.processorCount),
threadCountBatch: UInt32 = .init(ProcessInfo.processInfo.processorCount),
ropeFreqBase: Float = 0.0,
ropeFreqScale: Float = 0.0,
offloadKQV: Bool = true,
kvKeyType: GGMLType = .f16,
kvValueType: GGMLType = .f16,
computeAllLogits: Bool = false,
embeddingsOnly: Bool = false
seed: UInt64? = nil
) {
self.wrapped = llama_context_default_params()

self.seed = seed
self.contextWindowSize = contextWindowSize
self.batchSize = batchSize
self.threadCount = threadCount
self.threadCountBatch = threadCountBatch
self.ropeFreqBase = ropeFreqBase
self.ropeFreqScale = ropeFreqScale
self.offloadKQV = offloadKQV
self.kvKeyType = kvKeyType
self.kvValueType = kvValueType
self.computeAllLogits = computeAllLogits
self.embeddingsOnly = embeddingsOnly
}
}
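Given the simplified initializer above, configuring the context now boils down to the optional seed; a minimal usage sketch:

```swift
// Fixed seed for reproducible sampling.
let reproducibleParameters = LLMLocalContextParameters(seed: 42)

// Default configuration: `seed` stays `nil`, which results in a random seed.
let defaultParameters = LLMLocalContextParameters()
```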