|
| 1 | +package io.github.quafadas.scautable |
| 2 | + |
| 3 | +import java.io.File |
| 4 | +import scala.NamedTuple.* |
| 5 | +import scala.collection.JavaConverters.* |
| 6 | +import org.apache.poi.ss.usermodel.{Row, WorkbookFactory} |
| 7 | +import org.apache.poi.ss.util.CellRangeAddress |
| 8 | +import io.github.quafadas.scautable.BadTableException |
| 9 | + |
/** Iterator for reading Excel files with compile-time type safety.
  *
  * The workbook is opened read-only while the instance is constructed (the header row is read eagerly) and is closed
  * automatically once the iterator reports that it is exhausted, or when header extraction fails.
  *
  * @param filePath
  *   Path to the Excel file
  * @param sheetName
  *   Name of the Excel sheet to read
  * @param colRange
  *   Optional cell range specification (e.g., "A1:C10"). `None` and `Some("")` both mean "use the whole sheet, headers
  *   in the first row".
  * @param decoder
  *   Row decoder for converting string data to typed tuples
  * @tparam K
  *   Tuple type representing column names
  * @tparam V
  *   Tuple type representing column value types
  * @throws BadTableException
  *   if the sheet does not exist, no header row can be read, or the headers contain duplicates
  */
class ExcelIterator[K <: Tuple, V <: Tuple](filePath: String, sheetName: String, colRange: Option[String])(using decoder: RowDecoder[V]) extends Iterator[NamedTuple[K, V]]:

  type COLUMNS = K

  // Public accessors for compile-time code generation
  def getFilePath: String = filePath
  def getSheet: String = sheetName
  def getColRange: Option[String] = colRange

  /** Parses a cell range string into `(firstRow, lastRow, firstColumn, lastColumn)`.
    */
  private def parseRange(range: String): (Int, Int, Int, Int) =
    val cellRange = CellRangeAddress.valueOf(range)
    (cellRange.getFirstRow, cellRange.getLastRow, cellRange.getFirstColumn, cellRange.getLastColumn)
  end parseRange

  // The range is parsed exactly once, before the file is touched; an empty range string counts as "no range".
  private val rangeBounds: Option[(Int, Int, Int, Int)] = colRange.filter(_.nonEmpty).map(parseRange)

  // Handle on the open workbook; reset to None once it has been closed.
  // Must be declared before `headers`, whose initialisation forces `sheetIterator`.
  private var openWorkbook: Option[org.apache.poi.ss.usermodel.Workbook] = None

  /** Closes the workbook if it is still open. Safe to call repeatedly.
    */
  private def closeWorkbook(): Unit =
    openWorkbook.foreach { wb =>
      // The file is opened read-only, so a failing close cannot lose data; it must not mask the caller's outcome.
      try wb.close()
      catch case scala.util.control.NonFatal(_) => ()
    }
    openWorkbook = None
  end closeWorkbook

  /** Validates that headers are unique (no duplicates). Reports the first repeated header.
    */
  private def validateUniqueHeaders(headers: List[String]): Unit =
    headers
      .diff(headers.distinct)
      .headOption
      .foreach(header => throw new BadTableException(s"Duplicate header found: $header, which will not work."))
  end validateUniqueHeaders

  // Lazy-initialized sheet iterator to avoid opening file until needed
  private lazy val sheetIterator: Iterator[Row] =
    // Read-only: this class never writes, and closing a read-write package could save it back to disk.
    val workbook = WorkbookFactory.create(new File(filePath), null, true)
    openWorkbook = Some(workbook)
    // getSheet returns null for an unknown sheet name; surface that as a table error instead of an NPE.
    Option(workbook.getSheet(sheetName)) match
      case Some(sheet) => sheet.iterator().asScala
      case None =>
        closeWorkbook()
        throw new BadTableException(s"Sheet '$sheetName' not found in $filePath.")
    end match
  end sheetIterator

  // Row counter used for error reporting and for the range bound in `hasNext`.
  // Starts at the header row's position (0 without a range) and advances after every decoded row.
  private var currentRowIndex: Int = rangeBounds.fold(0)(_._1)

  // Extract headers from the first row or specified range, and validate them. The workbook is released on failure.
  private val headers: List[String] =
    try
      val found = rangeBounds match
        case Some((firstRow, _, firstCol, lastCol)) => extractHeadersFromRange(firstRow, firstCol, lastCol)
        case None                                   => extractHeadersFromFirstRow()
      validateUniqueHeaders(found)
      found
    catch
      case scala.util.control.NonFatal(e) =>
        closeWorkbook()
        throw e

  private lazy val numCellsPerRow = headers.size

  /** Reads the cells in columns `firstCol..lastCol` (inclusive) of a row as strings; missing cells become blank.
    */
  private def cellsInColumns(row: Row, firstCol: Int, lastCol: Int): List[String] =
    (firstCol to lastCol).map(i => row.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK).toString).toList
  end cellsInColumns

  /** Extract headers from a specified cell range. This consumes every row up to and including the header row from the
    * sheet iterator.
    */
  private def extractHeadersFromRange(firstRow: Int, firstCol: Int, lastCol: Int): List[String] =
    // Advance the iterator itself rather than calling `drop`, which would leave `sheetIterator` in an unspecified state.
    // NOTE(review): the offset is counted in rows yielded by the sheet iterator; POI presumably skips rows that were
    // never created, so sheets with leading gaps may need row-number based positioning - confirm.
    var remaining = firstRow
    while remaining > 0 && sheetIterator.hasNext do
      sheetIterator.next()
      remaining -= 1
    end while
    if !sheetIterator.hasNext then
      throw new BadTableException(s"No header row found for range ${colRange.getOrElse("")} in sheet $sheetName.")
    end if
    cellsInColumns(sheetIterator.next(), firstCol, lastCol)
  end extractHeadersFromRange

  /** Extract headers from the first row of the sheet. This consumes the header row from the sheet iterator.
    */
  private def extractHeadersFromFirstRow(): List[String] =
    if sheetIterator.hasNext then sheetIterator.next().cellIterator().asScala.map(_.toString).toList
    else throw new BadTableException("No headers found in the first row of the sheet, and no range specified.")
  end extractHeadersFromFirstRow

  /** Extract cell values from a row based on the column range (all defined cells when no range is given).
    */
  private def extractCellValues(row: org.apache.poi.ss.usermodel.Row): List[String] =
    rangeBounds match
      case Some((_, _, firstCol, lastCol)) => cellsInColumns(row, firstCol, lastCol)
      case None                            => row.cellIterator().asScala.map(_.toString).toList
  end extractCellValues

  /** Reads, validates and decodes the next data row.
    *
    * @throws NoSuchElementException
    *   if there are no more rows
    * @throws BadTableException
    *   if the row does not have as many cells as there are headers
    */
  override def next(): NamedTuple[K, V] =
    if !hasNext then throw new NoSuchElementException("No more rows")
    end if

    val row = sheetIterator.next()
    val cellValues = extractCellValues(row)

    // Validate row has expected number of cells
    if cellValues.size != numCellsPerRow then
      throw new BadTableException(
        s"Row $currentRowIndex has ${cellValues.size} cells, but expected ${headers.size} cells. Reading terminated."
      )
    end if

    // Decode the row using the provided decoder
    val decodedTuple = decoder
      .decodeRow(cellValues)
      .getOrElse(
        throw new Exception(s"Failed to decode row $currentRowIndex: $cellValues")
      )

    currentRowIndex += 1
    NamedTuple.build[K]()(decodedTuple)
  end next

  /** True while another data row is available. With a range, iteration stops at the range's last row or at the end of
    * the sheet, whichever comes first. Closes the workbook the first time it returns false.
    */
  override def hasNext: Boolean =
    val withinRange = rangeBounds.forall { case (_, lastRow, _, _) => currentRowIndex < lastRow }
    // `openWorkbook` is empty only after exhaustion, so the closed workbook is never consulted again.
    val more = openWorkbook.isDefined && withinRange && sheetIterator.hasNext
    if !more then closeWorkbook()
    end if
    more
  end hasNext

end ExcelIterator
0 commit comments