Skip to content

inconsistent parquet format between hits.parquet and hits_0.parquet #18

@waitingkuo

Description

@waitingkuo

In hits.parquet, binary columns carry the String logical-type annotation, and every field is declared required.

e.g. required binary field_id=-1 Title (String);

In [7]: pq.read_metadata('hits.parquet').schema
Out[7]: 
<pyarrow._parquet.ParquetSchema object at 0x11238ab40>
required group field_id=-1 schema {
  required int64 field_id=-1 WatchID;
  required int32 field_id=-1 JavaEnable (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 Title (String);
  required int32 field_id=-1 GoodEvent (Int(bitWidth=16, isSigned=true));
  required int64 field_id=-1 EventTime;
  required int32 field_id=-1 EventDate (Int(bitWidth=16, isSigned=false));
  required int32 field_id=-1 CounterID;
  required int32 field_id=-1 ClientIP;
  required int32 field_id=-1 RegionID;
  required int64 field_id=-1 UserID;
  required int32 field_id=-1 CounterClass (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 OS (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 UserAgent (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 URL (String);
  required binary field_id=-1 Referer (String);
  required int32 field_id=-1 IsRefresh (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 RefererCategoryID (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 RefererRegionID;
  required int32 field_id=-1 URLCategoryID (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 URLRegionID;
  required int32 field_id=-1 ResolutionWidth (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 ResolutionHeight (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 ResolutionDepth (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 FlashMajor (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 FlashMinor (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 FlashMinor2 (String);
  required int32 field_id=-1 NetMajor (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 NetMinor (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 UserAgentMajor (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 UserAgentMinor (String);
  required int32 field_id=-1 CookieEnable (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 JavascriptEnable (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 IsMobile (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 MobilePhone (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 MobilePhoneModel (String);
  required binary field_id=-1 Params (String);
  required int32 field_id=-1 IPNetworkID;
  required int32 field_id=-1 TraficSourceID (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 SearchEngineID (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 SearchPhrase (String);
  required int32 field_id=-1 AdvEngineID (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 IsArtifical (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 WindowClientWidth (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 WindowClientHeight (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 ClientTimeZone (Int(bitWidth=16, isSigned=true));
  required int64 field_id=-1 ClientEventTime;
  required int32 field_id=-1 SilverlightVersion1 (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 SilverlightVersion2 (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 SilverlightVersion3;
  required int32 field_id=-1 SilverlightVersion4 (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 PageCharset (String);
  required int32 field_id=-1 CodeVersion;
  required int32 field_id=-1 IsLink (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 IsDownload (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 IsNotBounce (Int(bitWidth=16, isSigned=true));
  required int64 field_id=-1 FUniqID;
  required binary field_id=-1 OriginalURL (String);
  required int32 field_id=-1 HID;
  required int32 field_id=-1 IsOldCounter (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 IsEvent (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 IsParameter (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 DontCountHits (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 WithHash (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 HitColor (String);
  required int64 field_id=-1 LocalEventTime;
  required int32 field_id=-1 Age (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 Sex (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 Income (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 Interests (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 Robotness (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 RemoteIP;
  required int32 field_id=-1 WindowName;
  required int32 field_id=-1 OpenerName;
  required int32 field_id=-1 HistoryLength (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 BrowserLanguage (String);
  required binary field_id=-1 BrowserCountry (String);
  required binary field_id=-1 SocialNetwork (String);
  required binary field_id=-1 SocialAction (String);
  required int32 field_id=-1 HTTPError (Int(bitWidth=16, isSigned=true));
  required int32 field_id=-1 SendTiming;
  required int32 field_id=-1 DNSTiming;
  required int32 field_id=-1 ConnectTiming;
  required int32 field_id=-1 ResponseStartTiming;
  required int32 field_id=-1 ResponseEndTiming;
  required int32 field_id=-1 FetchTiming;
  required int32 field_id=-1 SocialSourceNetworkID (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 SocialSourcePage (String);
  required int64 field_id=-1 ParamPrice;
  required binary field_id=-1 ParamOrderID (String);
  required binary field_id=-1 ParamCurrency (String);
  required int32 field_id=-1 ParamCurrencyID (Int(bitWidth=16, isSigned=true));
  required binary field_id=-1 OpenstatServiceName (String);
  required binary field_id=-1 OpenstatCampaignID (String);
  required binary field_id=-1 OpenstatAdID (String);
  required binary field_id=-1 OpenstatSourceID (String);
  required binary field_id=-1 UTMSource (String);
  required binary field_id=-1 UTMMedium (String);
  required binary field_id=-1 UTMCampaign (String);
  required binary field_id=-1 UTMContent (String);
  required binary field_id=-1 UTMTerm (String);
  required binary field_id=-1 FromTag (String);
  required int32 field_id=-1 HasGCLID (Int(bitWidth=16, isSigned=true));
  required int64 field_id=-1 RefererHash;
  required int64 field_id=-1 URLHash;
  required int32 field_id=-1 CLID;
}

In hits_0.parquet, by contrast, binary columns have no String logical-type annotation, and every field is declared optional.

e.g. optional binary field_id=-1 Title;

In [8]: pq.read_metadata('hits_0.parquet').schema
Out[8]: 
<pyarrow._parquet.ParquetSchema object at 0x114c30fc0>
required group field_id=-1 schema {
  optional int64 field_id=-1 WatchID;
  optional int32 field_id=-1 JavaEnable (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 Title;
  optional int32 field_id=-1 GoodEvent (Int(bitWidth=16, isSigned=true));
  optional int64 field_id=-1 EventTime;
  optional int32 field_id=-1 EventDate (Int(bitWidth=16, isSigned=false));
  optional int32 field_id=-1 CounterID;
  optional int32 field_id=-1 ClientIP;
  optional int32 field_id=-1 RegionID;
  optional int64 field_id=-1 UserID;
  optional int32 field_id=-1 CounterClass (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 OS (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 UserAgent (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 URL;
  optional binary field_id=-1 Referer;
  optional int32 field_id=-1 IsRefresh (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 RefererCategoryID (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 RefererRegionID;
  optional int32 field_id=-1 URLCategoryID (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 URLRegionID;
  optional int32 field_id=-1 ResolutionWidth (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 ResolutionHeight (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 ResolutionDepth (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 FlashMajor (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 FlashMinor (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 FlashMinor2;
  optional int32 field_id=-1 NetMajor (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 NetMinor (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 UserAgentMajor (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 UserAgentMinor;
  optional int32 field_id=-1 CookieEnable (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 JavascriptEnable (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsMobile (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 MobilePhone (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 MobilePhoneModel;
  optional binary field_id=-1 Params;
  optional int32 field_id=-1 IPNetworkID;
  optional int32 field_id=-1 TraficSourceID (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 SearchEngineID (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 SearchPhrase;
  optional int32 field_id=-1 AdvEngineID (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsArtifical (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 WindowClientWidth (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 WindowClientHeight (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 ClientTimeZone (Int(bitWidth=16, isSigned=true));
  optional int64 field_id=-1 ClientEventTime;
  optional int32 field_id=-1 SilverlightVersion1 (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 SilverlightVersion2 (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 SilverlightVersion3;
  optional int32 field_id=-1 SilverlightVersion4 (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 PageCharset;
  optional int32 field_id=-1 CodeVersion;
  optional int32 field_id=-1 IsLink (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsDownload (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsNotBounce (Int(bitWidth=16, isSigned=true));
  optional int64 field_id=-1 FUniqID;
  optional binary field_id=-1 OriginalURL;
  optional int32 field_id=-1 HID;
  optional int32 field_id=-1 IsOldCounter (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsEvent (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsParameter (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 DontCountHits (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 WithHash (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 HitColor;
  optional int64 field_id=-1 LocalEventTime;
  optional int32 field_id=-1 Age (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 Sex (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 Income (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 Interests (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 Robotness (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 RemoteIP;
  optional int32 field_id=-1 WindowName;
  optional int32 field_id=-1 OpenerName;
  optional int32 field_id=-1 HistoryLength (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 BrowserLanguage;
  optional binary field_id=-1 BrowserCountry;
  optional binary field_id=-1 SocialNetwork;
  optional binary field_id=-1 SocialAction;
  optional int32 field_id=-1 HTTPError (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 SendTiming;
  optional int32 field_id=-1 DNSTiming;
  optional int32 field_id=-1 ConnectTiming;
  optional int32 field_id=-1 ResponseStartTiming;
  optional int32 field_id=-1 ResponseEndTiming;
  optional int32 field_id=-1 FetchTiming;
  optional int32 field_id=-1 SocialSourceNetworkID (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 SocialSourcePage;
  optional int64 field_id=-1 ParamPrice;
  optional binary field_id=-1 ParamOrderID;
  optional binary field_id=-1 ParamCurrency;
  optional int32 field_id=-1 ParamCurrencyID (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 OpenstatServiceName;
  optional binary field_id=-1 OpenstatCampaignID;
  optional binary field_id=-1 OpenstatAdID;
  optional binary field_id=-1 OpenstatSourceID;
  optional binary field_id=-1 UTMSource;
  optional binary field_id=-1 UTMMedium;
  optional binary field_id=-1 UTMCampaign;
  optional binary field_id=-1 UTMContent;
  optional binary field_id=-1 UTMTerm;
  optional binary field_id=-1 FromTag;
  optional int32 field_id=-1 HasGCLID (Int(bitWidth=16, isSigned=true));
  optional int64 field_id=-1 RefererHash;
  optional int64 field_id=-1 URLHash;
  optional int32 field_id=-1 CLID;
}

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions