
High CPU usage and slow loading speeds when parsing large xlsx files #181

@johannesjong

Description


Hi, I'm working on a tool that extracts info from an xlsx file (3 columns: filename, character and time constraint, for use in game audio localisation; the columns come from the script) and uses this to check a delivery of audio files or prepare batches of audio files based on those 3 values. With files of about 2,500 rows the app runs fine, but as the files get bigger the speed decreases rapidly: it takes a full minute to load the data from a file with 47,000 rows, for example. Any thoughts on how this process can be sped up? Right now I'm parsing the data into a Core Data model in SwiftUI/Xcode for a macOS app, using the following code:

func readXLSXFile(fileURL: URL, filenameColumn: String, characterColumn: String, timeConstraintColumn: String, completion: @escaping () -> Void) {
    isParsing = true
    DispatchQueue.global(qos: .userInitiated).async { [unowned self] in
        do {
            guard let file = XLSXFile(filepath: fileURL.path),
                  let sharedStrings = try file.parseSharedStrings(),
                  let firstWorksheet = try file.parseWorksheetPaths().first else {
                DispatchQueue.main.async {
                    print("Error: File could not be opened or worksheet not found.")
                    self.isParsing = false
                    completion()
                }
                return
            }

            let worksheet = try file.parseWorksheet(at: firstWorksheet)
            let rows = Array(worksheet.data?.rows.dropFirst(headerRows) ?? [])
            let rowCount = rows.count
            let chunkSize = 2500 // Adjust chunk size based on your needs

            DispatchQueue.concurrentPerform(iterations: (rowCount + chunkSize - 1) / chunkSize) { index in
                let start = index * chunkSize
                let end = min(start + chunkSize, rowCount)
                if start < rowCount { // Ensure the start index is within the row count
                    let range = start..<end
                    let chunkRows = Array(rows[range])

                    let context = PersistenceController.shared.newBackgroundContext()
                    context.performAndWait {
                        for row in chunkRows {
                            self.processRow(row, in: context, with: sharedStrings, filenameColumn: filenameColumn, characterColumn: characterColumn, timeConstraintColumn: timeConstraintColumn)
                        }
                        try? context.save()
                        context.reset()
                    }
                }
            }

            DispatchQueue.main.async {
                self.isParsing = false
                completion() // Signal that the operation has completed
            }
        } catch {
            DispatchQueue.main.async {
                print("Error parsing file: \(error)")
                self.isParsing = false
                completion()
            }
        }
    }
}


private func processRow(_ row: Row, in context: NSManagedObjectContext, with sharedStrings: SharedStrings, filenameColumn: String, characterColumn: String, timeConstraintColumn: String) {
    let filenameCell = row.cells.first(where: { $0.reference.column == ColumnReference(filenameColumn) })
    let characterCell = row.cells.first(where: { $0.reference.column == ColumnReference(characterColumn) })
    let timeConstraintCell = row.cells.first(where: { $0.reference.column == ColumnReference(timeConstraintColumn) })

    let filename = filenameCell?.stringValue(sharedStrings) ?? ""
    let character = characterCell?.stringValue(sharedStrings) ?? ""
    let timeConstraint = timeConstraintCell?.stringValue(sharedStrings) ?? ""

    let record = Record(context: context) // Create a new Record instance
    record.filename = filename
    record.character = character
    record.timeConstraint = timeConstraint
}
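For what it's worth, one likely hot spot in processRow is that every call re-parses the three column letters into ColumnReference values and then does a separate linear scan of row.cells for each column. A sketch of a variant that hoists the parsing out of the per-row path and indexes each row's cells once (this assumes the same CoreXLSX types used above; the name processRowFast and the hoisted parameters are mine, not part of the original code):

```swift
// Sketch only: hoists ColumnReference parsing out of the per-row path and
// replaces three linear scans of row.cells with a single dictionary build.
// Assumes the CoreXLSX types (Row, Cell, ColumnReference, SharedStrings)
// used above; `processRowFast` is a hypothetical name.
private func processRowFast(_ row: Row,
                            in context: NSManagedObjectContext,
                            with sharedStrings: SharedStrings,
                            filenameRef: ColumnReference,
                            characterRef: ColumnReference,
                            timeConstraintRef: ColumnReference) {
    // Index this row's cells by column once, instead of scanning three times.
    let cellsByColumn = Dictionary(row.cells.map { ($0.reference.column, $0) },
                                   uniquingKeysWith: { first, _ in first })

    let record = Record(context: context)
    record.filename = cellsByColumn[filenameRef]?.stringValue(sharedStrings) ?? ""
    record.character = cellsByColumn[characterRef]?.stringValue(sharedStrings) ?? ""
    record.timeConstraint = cellsByColumn[timeConstraintRef]?.stringValue(sharedStrings) ?? ""
}
```

The caller would build the three ColumnReference values once before the chunk loop (e.g. `guard let filenameRef = ColumnReference(filenameColumn) else { return }`), so the string parsing happens three times total instead of three times per row.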


private func batchInsertRecords(records: [Record], into context: NSManagedObjectContext) {
    for record in records {
        context.insert(record)
    }
    do {
        try context.save()
        context.reset() // Clear the context to free memory
    } catch {
        print("Failed to batch insert records: \(error)")
    }
}
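If creating a managed object per row turns out to be the bottleneck, Core Data also offers NSBatchInsertRequest, which writes rows directly to the store without instantiating NSManagedObject instances at all. A rough sketch, assuming a Core Data entity named "Record" with the three string attributes used above (the dictionaryHandler initializer requires macOS 11+; `parsedRows` is a placeholder for whatever the worksheet pass produces, not something in the original code):

```swift
import CoreData

// Sketch only: batch-inserts pre-parsed rows without creating Record objects.
// Assumes an entity named "Record" with String attributes "filename",
// "character", and "timeConstraint"; `parsedRows` is hypothetical.
func batchInsert(_ parsedRows: [(filename: String, character: String, timeConstraint: String)],
                 into context: NSManagedObjectContext) throws {
    var index = 0
    let request = NSBatchInsertRequest(entityName: "Record",
                                       dictionaryHandler: { dict in
        // Returning true tells Core Data the insert is finished.
        guard index < parsedRows.count else { return true }
        let row = parsedRows[index]
        dict.setDictionary(["filename": row.filename,
                            "character": row.character,
                            "timeConstraint": row.timeConstraint])
        index += 1
        return false // more rows to insert
    })
    try context.execute(request)
}
```

Because the batch insert bypasses the managed object graph, any in-memory contexts would need to be refreshed (or the store re-fetched) afterwards to see the new rows.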
