Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement to_csv #24

Merged
merged 1 commit into from
Nov 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion src/Pandas.NET/Extensions/PandasMethods.Excel.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
using System;
using System.Collections.Generic;
using System.Text;
using PandasNet.Impl;

namespace PandasNet
{
public static class PandasMethods
{
/// <summary>
///
/// Read a comma-separated values (csv) file into DataFrame.
/// </summary>
/// <param name="pd"></param>
/// <param name="filepath_or_buffer"></param>
Expand All @@ -18,5 +19,36 @@ public static IDataFrame read_csv(this Pandas pd, string filepath, string sep =
{
throw new NotImplementedException();
}

/// <summary>
/// Write object to a comma-separated values (csv) file.
/// </summary>
/// <param name="df">Data frame whose values are written.</param>
/// <param name="filepath">File path.</param>
/// <param name="sep">Field delimiter for the output file.</param>
/// <param name="na_rep">Missing data representation; <c>null</c> is treated as an empty string.</param>
/// <param name="float_format">Format string for floating point numbers.</param>
/// <param name="columns">Columns to write.</param>
/// <param name="header">Write out the column names.</param>
/// <param name="quoting">
/// Numeric value of a <c>CsvQuoteStyle</c> member: 0 = QUOTE_MINIMAL,
/// 1 = QUOTE_ALL, 2 = QUOTE_NONNUMERIC, 3 = QUOTE_NONE.
/// Defaults to QUOTE_MINIMAL. If you have set a float_format then
/// floats are converted to strings and thus QUOTE_NONNUMERIC will
/// treat them as non-numeric.
/// </param>
/// <param name="quotechar">Character used to quote fields.</param>
/// <param name="line_terminator">
/// The newline character or character sequence to use in the output
/// file. When null or empty, <see cref="Environment.NewLine"/> is used,
/// which depends on the OS in which this method is called
/// ("\n" on Linux, "\r\n" on Windows).
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="df"/> or <paramref name="filepath"/> is null.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="quoting"/> is not the value of a <c>CsvQuoteStyle</c> member.
/// </exception>
public static void to_csv(this IDataFrame df, string filepath, char sep = ',',
    string na_rep = "", string float_format = null, IEnumerable<string> columns = null,
    bool header = true, int quoting = (int) CsvQuoteStyle.QUOTE_MINIMAL,
    char quotechar = '"', string line_terminator = null)
{
    if (df == null) { throw new ArgumentNullException(nameof(df)); }
    if (filepath == null) { throw new ArgumentNullException(nameof(filepath)); }

    // Reject unknown quoting values up front, before any file is created.
    if (!Enum.IsDefined(typeof(CsvQuoteStyle), quoting))
    {
        throw new ArgumentOutOfRangeException(nameof(quoting), quoting,
            "Value is not a valid CsvQuoteStyle.");
    }

    var terminator = string.IsNullOrEmpty(line_terminator)
        ? Environment.NewLine
        : line_terminator;

    // UTF-8 without a byte order mark.
    var writer = new CsvWriter(sep, na_rep ?? string.Empty, float_format, header,
        (CsvQuoteStyle) quoting, quotechar, terminator, new UTF8Encoding(false));
    writer.Write(filepath, df, columns);
}
}
}
158 changes: 158 additions & 0 deletions src/Pandas.NET/Impl/CsvWriter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using NumSharp;

namespace PandasNet.Impl
{
/// <summary>
/// Writes the values of an <see cref="IDataFrame"/> to a delimited text file.
/// </summary>
internal class CsvWriter
{
    private readonly byte[] delimiterBytes;
    private readonly string noValue;
    private readonly string floatFormat;
    private readonly bool header;
    private readonly CsvQuoteStyle quotingStyle;
    private readonly string quoteText;
    private readonly string escapedQuoteText;
    private readonly byte[] quoteBytes;
    private readonly char[] specialChars;
    private readonly Encoding encoding;
    private readonly byte[] lfBytes;

    /// <summary>
    /// Creates a writer with the given formatting options.
    /// </summary>
    /// <param name="sep">Field delimiter.</param>
    /// <param name="na_rep">Text written for empty values; null is treated as empty.</param>
    /// <param name="floatFormat">Format string applied to float32/float64 values, or null.</param>
    /// <param name="header">Whether a line with the column labels is written first.</param>
    /// <param name="quotingStyle">Rule deciding which fields are quoted.</param>
    /// <param name="quotechar">Character placed around quoted fields.</param>
    /// <param name="lineTerminator">Character sequence ending each line; must not be empty.</param>
    /// <param name="encoding">Encoding used for all text written to the file.</param>
    /// <exception cref="ArgumentNullException"><paramref name="encoding"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="lineTerminator"/> is null or empty, or
    /// <paramref name="quotingStyle"/> is not a defined value.
    /// </exception>
    internal CsvWriter(char sep, string na_rep, string floatFormat,
        bool header, CsvQuoteStyle quotingStyle, char quotechar,
        string lineTerminator, Encoding encoding)
    {
        if (encoding == null) { throw new ArgumentNullException(nameof(encoding)); }
        if (string.IsNullOrEmpty(lineTerminator))
        {
            throw new ArgumentException("Line terminator must not be empty.",
                nameof(lineTerminator));
        }
        if (!Enum.IsDefined(typeof(CsvQuoteStyle), quotingStyle))
        {
            throw new ArgumentException("Invalid value", nameof(quotingStyle));
        }

        this.encoding = encoding;
        this.floatFormat = floatFormat;
        this.header = header;
        this.quotingStyle = quotingStyle;
        noValue = na_rep ?? string.Empty;

        // Encode the delimiter and quote character with the output encoding;
        // a plain (byte) cast would truncate any character above U+00FF.
        delimiterBytes = encoding.GetBytes(new[] { sep });
        quoteBytes = encoding.GetBytes(new[] { quotechar });
        quoteText = quotechar.ToString();
        escapedQuoteText = quoteText + quoteText;

        // Every character of the line terminator is special, not only the first two.
        specialChars = new[] { sep, quotechar }
            .Concat(lineTerminator).Distinct().ToArray();
        lfBytes = encoding.GetBytes(lineTerminator);
    }

    /// <summary>
    /// Writes the selected columns of <paramref name="df"/> to a new file at
    /// <paramref name="filepath"/>, replacing any existing file.
    /// </summary>
    /// <param name="filepath">Path of the file to create.</param>
    /// <param name="df">Data frame providing the values.</param>
    /// <param name="columns">Labels of the columns to write; null selects all columns.</param>
    internal void Write(string filepath, IDataFrame df,
        IEnumerable<string> columns)
    {
        var columnLabels = columns == null ?
            df.Columns.Values.Data<string>() : columns.ToArray();
        var columnCount = columnLabels.Length;
        int rowCount = df.Index.Size;
        // Select the data before creating the file so a failing column lookup
        // does not leave an empty file behind.
        var data = df[columnLabels].Values;
        using (var fs = File.Create(filepath))
        {
            if (columnCount == 0) { return; }
            if (header) { WriteHeader(fs, columnLabels); }
            for (var row = 0; row < rowCount; row++)
            {
                var rowValues = data[row];
                for (var col = 0; col < columnCount; col++)
                {
                    if (col > 0) { fs.Write(delimiterBytes, 0, delimiterBytes.Length); }
                    WriteField(rowValues[col], fs);
                }
                fs.Write(lfBytes, 0, lfBytes.Length);
            }
        }
    }

    /// <summary>
    /// Formats a single value and writes it, quoted if the quoting style requires it.
    /// </summary>
    private void WriteField(NDArray fieldValue, Stream fs)
    {
        var text = noValue;
        var isNumeric = false;
        if (fieldValue.size > 0)
        {
            var dtype = fieldValue.dtype;
            if (floatFormat != null && (dtype == np.float32 || dtype == np.float64))
            {
                // Invariant culture keeps '.' as the decimal separator so the
                // output does not depend on the machine's regional settings.
                // A formatted float counts as text for QUOTE_NONNUMERIC.
                text = ((double) fieldValue).ToString(floatFormat,
                    System.Globalization.CultureInfo.InvariantCulture);
            }
            else
            {
                text = fieldValue.ToString();
                // Decide by element type: the NDArray wrapper itself is never a
                // boxed primitive. Values stored with an object dtype are
                // therefore treated as non-numeric.
                isNumeric = IsNumericType(dtype);
            }
        }
        WriteText(fs, text, NeedsQuoting(text, isNumeric));
    }

    /// <summary>
    /// Decides whether a field must be quoted, based on the text that will be written.
    /// </summary>
    private bool NeedsQuoting(string text, bool isNumeric)
    {
        switch (quotingStyle)
        {
            case CsvQuoteStyle.QUOTE_MINIMAL:
                return ContainsSpecialChar(text);
            case CsvQuoteStyle.QUOTE_ALL:
                return true;
            case CsvQuoteStyle.QUOTE_NONNUMERIC:
                return !isNumeric || ContainsSpecialChar(text);
            case CsvQuoteStyle.QUOTE_NONE:
                return false;
            default:
                throw new ArgumentException("Invalid value", nameof(quotingStyle));
        }
    }

    /// <summary>
    /// Writes the column labels on one line to the stream.
    /// </summary>
    /// <param name="fs">Output stream</param>
    /// <param name="columnLabels">Column names</param>
    private void WriteHeader(Stream fs, string[] columnLabels)
    {
        for (var i = 0; i < columnLabels.Length; i++)
        {
            if (i > 0) { fs.Write(delimiterBytes, 0, delimiterBytes.Length); }
            var label = columnLabels[i] ?? string.Empty;
            // Plain labels stay unquoted as before; a label is quoted only when
            // it contains a special character and quoting is not disabled.
            var quoted = quotingStyle != CsvQuoteStyle.QUOTE_NONE &&
                ContainsSpecialChar(label);
            WriteText(fs, label, quoted);
        }
        fs.Write(lfBytes, 0, lfBytes.Length);
    }

    /// <summary>
    /// Writes text to the stream; when quoted, embedded quote characters are doubled.
    /// </summary>
    private void WriteText(Stream fs, string text, bool quoted)
    {
        if (quoted)
        {
            text = text.Replace(quoteText, escapedQuoteText);
            fs.Write(quoteBytes, 0, quoteBytes.Length);
        }
        var bytes = encoding.GetBytes(text);
        fs.Write(bytes, 0, bytes.Length);
        if (quoted) { fs.Write(quoteBytes, 0, quoteBytes.Length); }
    }

    /// <summary>
    /// Returns true if the text contains the delimiter, the quote character
    /// or any character of the line terminator.
    /// </summary>
    private bool ContainsSpecialChar(string text)
    {
        return text.IndexOfAny(specialChars) >= 0;
    }

    /// <summary>
    /// Returns true if the given element type is a built-in numeric type.
    /// </summary>
    private static bool IsNumericType(Type type)
    {
        return type == typeof(sbyte) || type == typeof(byte) ||
            type == typeof(short) || type == typeof(ushort) ||
            type == typeof(int) || type == typeof(uint) ||
            type == typeof(long) || type == typeof(ulong) ||
            type == typeof(float) || type == typeof(double) ||
            type == typeof(decimal);
    }
}

/// <summary>
/// Quoting rules a csv writer can apply to the fields it emits.
/// </summary>
internal enum CsvQuoteStyle
{
    /// <summary>
    /// Quote a field only if it contains a special character: the
    /// delimiter, the quotechar or a character of the lineterminator.
    /// </summary>
    QUOTE_MINIMAL = 0,

    /// <summary>
    /// Quote every field.
    /// </summary>
    QUOTE_ALL = 1,

    /// <summary>
    /// <para>Quote every field that is not numeric.</para>
    /// <para>For a reader: convert every non-quoted field to type
    /// float.</para>
    /// </summary>
    QUOTE_NONNUMERIC = 2,

    /// <summary>
    /// <para>Never quote a field. A delimiter occurring in the output
    /// data is meant to be preceded by the current escapechar; when no
    /// escapechar is set, the writer is meant to raise an error for
    /// characters that require escaping.</para>
    /// <para>For a reader: quote characters get no special
    /// processing.</para>
    /// </summary>
    QUOTE_NONE = 3
}
}
8 changes: 7 additions & 1 deletion src/Pandas.NET/Impl/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,14 @@ namespace PandasNet.Impl
{
public partial class DataFrame<TIndex> : PandasObject, IDataFrame
{
/// <summary>
/// The index (row labels) of the DataFrame.
/// </summary>
public IDataIndex Index { get; internal set; }

/// <summary>
/// The column labels of the DataFrame.
/// </summary>
public IDataIndex Columns { get; internal set; }

/// <summary>
Expand Down Expand Up @@ -140,7 +146,7 @@ public IDataFrame this[params int[] columnIndexs]
get
{
var colLength = columnIndexs.Length;
NDArray array = new object[_rowSize, colLength];
NDArray array = new NDArray(Values.dtype, new Shape(_rowSize, colLength));
for (var rowIndex = 0; rowIndex < _rowSize; rowIndex++)
{
for (var col = 0; col < colLength; col++)
Expand Down
4 changes: 2 additions & 2 deletions src/Pandas.NET/Pandas.Net.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<TargetFramework>netstandard2.0</TargetFramework>
<RootNamespace>PandasNet</RootNamespace>
<Version>0.1.0</Version>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
Expand Down
76 changes: 76 additions & 0 deletions test/Pandas.NET.Test/DataFrameCsvTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
using Xunit;
using NumSharp;
using System.Linq;
using System.IO;

namespace PandasNet.Test
{
/// <summary>
/// Tests for writing a data frame to a csv file with <c>to_csv</c>.
/// Each test writes its own file and deletes it afterwards.
/// </summary>
public class DataFrameCsvTest
{
    private static readonly string[] ColumnNames =
        { "first", "second", "third", "fourth", "fifth" };

    /// <summary>
    /// Default options: plain header line followed by unquoted integer rows.
    /// </summary>
    [Fact]
    public void WriteCsv_ToFile_Test()
    {
        var filepath = "write_test.csv";
        var array = np.arange(100).reshape(20, 5);
        var df1 = CreateFrame(array);
        try
        {
            df1.to_csv(filepath);
            using (var fr = File.OpenText(filepath))
            {
                Assert.Equal(string.Join(',', ColumnNames), fr.ReadLine());
                for (var i = 0; i < array.shape[0]; i++)
                {
                    Assert.Equal(string.Join(',', array[i].Data<int>()), fr.ReadLine());
                }
                // Nothing may follow the last data row.
                Assert.Null(fr.ReadLine());
            }
        }
        finally
        {
            File.Delete(filepath);
        }
    }

    /// <summary>
    /// quoting: 1 (QUOTE_ALL): every data field is wrapped in double quotes.
    /// </summary>
    [Fact]
    public void WriteCsvQuoted_ToFile_Test()
    {
        var filepath = "write_quoted_test.csv";
        var array = np.arange(100).reshape(20, 5);
        var df1 = CreateFrame(array);
        try
        {
            df1.to_csv(filepath, quoting: 1);
            using (var fr = File.OpenText(filepath))
            {
                Assert.Equal(string.Join(',', ColumnNames), fr.ReadLine());
                for (var i = 0; i < array.shape[0]; i++)
                {
                    Assert.Equal('"' + string.Join("\",\"", array[i].Data<int>()) + '"', fr.ReadLine());
                }
                Assert.Null(fr.ReadLine());
            }
        }
        finally
        {
            File.Delete(filepath);
        }
    }

    /// <summary>
    /// float_format: floating point values are written using the format string.
    /// </summary>
    [Fact]
    public void WriteCsvFormated_ToFile_Test()
    {
        // Own file name; previously this reused the quoted test's file.
        var filepath = "write_formatted_test.csv";
        var array = np.arange(0, 50, 0.5).reshape(20, 5);
        var floatFormat = "E03";
        var df1 = CreateFrame(array);
        try
        {
            df1.to_csv(filepath, float_format: floatFormat);
            using (var fr = File.OpenText(filepath))
            {
                Assert.Equal(string.Join(',', ColumnNames), fr.ReadLine());
                for (var i = 0; i < array.shape[0]; i++)
                {
                    // csv is machine-readable: expect '.' as the decimal
                    // separator regardless of the machine's culture.
                    var formattedData = array[i].Data<double>().Select(
                        x => x.ToString(floatFormat,
                            System.Globalization.CultureInfo.InvariantCulture));
                    Assert.Equal(string.Join(",", formattedData), fr.ReadLine());
                }
                Assert.Null(fr.ReadLine());
            }
        }
        finally
        {
            File.Delete(filepath);
        }
    }

    /// <summary>
    /// Builds a data frame over <paramref name="array"/> with the five test column names.
    /// </summary>
    private static IDataFrame CreateFrame(NDArray array)
    {
        var pd = new Pandas();
        return pd.DataFrame(array, null, ColumnNames, typeof(object));
    }
}
}