Skip to content

Commit bc4f6e9

Browse files
author
Tristan Stevens
committed
Year insertion logic for ConvertTimestamp command
1 parent 6e2f510 commit bc4f6e9

File tree

3 files changed

+396
-15
lines changed

3 files changed

+396
-15
lines changed

kite-morphlines/kite-morphlines-core/src/main/java/org/kitesdk/morphline/stdlib/ConvertTimestampBuilder.java

Lines changed: 92 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
import com.google.common.base.Joiner;
4040
import com.typesafe.config.Config;
4141

42+
import javafx.util.Pair;
43+
4244
/**
4345
* Command that converts the timestamps in a given field from one of a set of input date formats (in
4446
* an input timezone) to an output date format (in an output timezone), while respecting daylight
@@ -60,12 +62,14 @@ public Command build(Config config, Command parent, Command child, MorphlineCont
6062
///////////////////////////////////////////////////////////////////////////////
6163
// Nested classes:
6264
///////////////////////////////////////////////////////////////////////////////
63-
private static final class ConvertTimestamp extends AbstractCommand {
65+
public static final class ConvertTimestamp extends AbstractCommand {
6466

6567
private final String fieldName;
66-
private final List<SimpleDateFormat> inputFormats = new ArrayList<SimpleDateFormat>();
68+
private final List<Pair<SimpleDateFormat, Boolean>> inputFormats = new ArrayList<Pair<SimpleDateFormat,Boolean>>();
6769
private final SimpleDateFormat outputFormat;
6870
private final String inputFormatsDebugString; // cached
71+
private final int insertYearMonthOffset;
72+
private final int insertYearOffset;
6973

7074
private static final String NATIVE_SOLR_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"; // e.g. 2007-04-26T08:05:04.789Z
7175
private static final SimpleDateFormat UNIX_TIME_IN_MILLIS = new SimpleDateFormat("'unixTimeInMillis'");
@@ -75,20 +79,31 @@ private static final class ConvertTimestamp extends AbstractCommand {
7579
DateUtil.DEFAULT_DATE_FORMATS.add(0, NATIVE_SOLR_FORMAT);
7680
}
7781

78-
public ConvertTimestamp(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
82+
ConvertTimestamp(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
7983
super(builder, config, parent, child, context);
8084

8185
this.fieldName = getConfigs().getString(config, "field", Fields.TIMESTAMP);
8286
TimeZone inputTimeZone = getTimeZone(getConfigs().getString(config, "inputTimezone", "UTC"));
8387
Locale inputLocale = getLocale(getConfigs().getString(config, "inputLocale", ""));
88+
89+
boolean insertYear = getConfigs().getBoolean(config, "insertMissingYear", false);
90+
//Defaults to -5 which gives a rolling -11 through +1 month offset (assumes historic messages)
91+
insertYearMonthOffset = getConfigs().getInt(config, "insertMissingYearMonthOffset", -5);
92+
insertYearOffset = getConfigs().getInt(config,"insertMissingYearOffset", 0);
93+
8494
for (String inputFormat : getConfigs().getStringList(config, "inputFormats", DateUtil.DEFAULT_DATE_FORMATS)) {
8595
SimpleDateFormat dateFormat = getUnixTimeFormat(inputFormat, inputTimeZone);
96+
boolean yearRequired = false;
8697
if (dateFormat == null) {
98+
if (insertYear && !inputFormat.contains("yy")) {
99+
inputFormat = "yyyy" + inputFormat;
100+
yearRequired=true;
101+
}
87102
dateFormat = new SimpleDateFormat(inputFormat, inputLocale);
88103
dateFormat.setTimeZone(inputTimeZone);
89104
dateFormat.set2DigitYearStart(DateUtil.DEFAULT_TWO_DIGIT_YEAR_START);
90105
}
91-
this.inputFormats.add(dateFormat);
106+
this.inputFormats.add(new Pair<SimpleDateFormat,Boolean>(dateFormat,yearRequired));
92107
}
93108
TimeZone outputTimeZone = getTimeZone(getConfigs().getString(config, "outputTimezone", "UTC"));
94109
Locale outputLocale = getLocale(getConfigs().getString(config, "outputLocale", ""));
@@ -99,19 +114,23 @@ public ConvertTimestamp(CommandBuilder builder, Config config, Command parent, C
99114
dateFormat.setTimeZone(outputTimeZone);
100115
}
101116
this.outputFormat = dateFormat;
102-
validateArguments();
103-
117+
104118
List<String> inputFormatsStringList = new ArrayList<String>();
105-
for (SimpleDateFormat inputFormat : inputFormats) {
119+
for (Pair<SimpleDateFormat,Boolean> inputFormat : inputFormats) {
106120
// SimpleDateFormat.toString() doesn't print anything useful
107-
inputFormatsStringList.add(inputFormat.toPattern());
121+
inputFormatsStringList.add(inputFormat.getKey().toPattern());
108122
}
109123
this.inputFormatsDebugString = inputFormatsStringList.toString();
110124

125+
126+
validateArguments();
127+
128+
111129
if (LOG.isTraceEnabled()) {
112130
LOG.trace("inputFormatsDebugString: {}", inputFormatsDebugString);
113131
LOG.trace("availableTimeZoneIDs: {}", Joiner.on("\n").join(TimeZone.getAvailableIDs()));
114132
LOG.trace("availableLocales: {}", Joiner.on("\n").join(Locale.getAvailableLocales()));
133+
LOG.trace("insertMissingYear: {}", insertYear);
115134
}
116135
}
117136

@@ -123,7 +142,9 @@ protected boolean doProcess(Record record) {
123142
while (iter.hasNext()) {
124143
String timestamp = iter.next().toString();
125144
boolean foundMatchingFormat = false;
126-
for (SimpleDateFormat inputFormat : inputFormats) {
145+
for (Pair<SimpleDateFormat,Boolean> inputFormatPair : inputFormats) {
146+
SimpleDateFormat inputFormat = inputFormatPair.getKey();
147+
boolean yearRequired = inputFormatPair.getValue();
127148
Date date;
128149
boolean isUnixTime;
129150
if (inputFormat == UNIX_TIME_IN_MILLIS) {
@@ -135,7 +156,15 @@ protected boolean doProcess(Record record) {
135156
} else {
136157
isUnixTime = false;
137158
pos.setIndex(0);
138-
date = inputFormat.parse(timestamp, pos);
159+
if (yearRequired) {
160+
Calendar cal = Calendar.getInstance();
161+
int targetYear = cal.get(Calendar.YEAR) + insertYearOffset;
162+
timestamp = targetYear + timestamp;
163+
date = inputFormat.parse(timestamp, pos);
164+
date = DateUtil.insertYear(date, new Date(), insertYearMonthOffset, targetYear, inputFormat.getTimeZone());
165+
} else {
166+
date = inputFormat.parse(timestamp, pos);
167+
}
139168
}
140169
if (date != null && (isUnixTime || pos.getIndex() == timestamp.length())) {
141170
String result;
@@ -208,6 +237,7 @@ private Locale getLocale(String name) {
208237
}
209238

210239

240+
211241
///////////////////////////////////////////////////////////////////////////////
212242
// Nested classes:
213243
///////////////////////////////////////////////////////////////////////////////
@@ -230,7 +260,7 @@ private Locale getLocale(String name) {
230260
/**
231261
* This class has some code from HttpClient DateUtil and Solrj DateUtil.
232262
*/
233-
private static final class DateUtil {
263+
public static final class DateUtil {
234264
//start HttpClient
235265
/**
236266
* Date format pattern used to parse HTTP date headers in RFC 1123 format.
@@ -259,8 +289,6 @@ private static final class DateUtil {
259289
DEFAULT_TWO_DIGIT_YEAR_START = calendar.getTime();
260290
}
261291

262-
// private static final TimeZone GMT = TimeZone.getTimeZone("GMT");
263-
264292
//end HttpClient
265293

266294
//---------------------------------------------------------------------------------------
@@ -280,7 +308,56 @@ private static final class DateUtil {
280308
DEFAULT_DATE_FORMATS.addAll(DateUtil.DEFAULT_HTTP_CLIENT_PATTERNS);
281309
}
282310

311+
//work around the fact that SimpleDateFormat doesn't handle missing year.
312+
//Code inspired by Flume SyslogParser.java
313+
//https://github.com/apache/flume/blob/trunk/flume-ng-core/src/main/java/org/apache/flume/source/SyslogParser.java
314+
public static Date insertYear(Date inputDate, Date currentDate, int monthOffset, int targetYear, TimeZone tz) {
315+
Calendar cal = Calendar.getInstance();
316+
cal.setTimeZone(tz);
317+
cal.setTime(inputDate);
318+
319+
//There are 12 months in a year. We offer a sliding window, for working out whether the parsed date falls within
320+
//the window (for dealing with year rollover issues).
321+
//Compute the upper and lower bound by moving +6 and -6 by the offset.
322+
int upperBound = monthOffset + 6;
323+
int lowerBound = monthOffset - 6;
324+
325+
//We're now going to check to see whether the date falls outside of the
326+
//upper or lower bounds by intentionally creating the wrong date and seeing
327+
//whether that falls in the past (or future)
328+
Calendar calMinusUpperBMonth = Calendar.getInstance();
329+
calMinusUpperBMonth.setTime(inputDate);
330+
calMinusUpperBMonth.set(Calendar.YEAR, targetYear);
331+
calMinusUpperBMonth.add(Calendar.MONTH, upperBound * -1);
332+
333+
Calendar calPlusLowerBMonths = Calendar.getInstance();
334+
calPlusLowerBMonths.setTime(inputDate);
335+
calPlusLowerBMonths.set(Calendar.YEAR, targetYear);
336+
calPlusLowerBMonths.add(Calendar.MONTH, lowerBound * -1);
337+
338+
Calendar calReferencePoint = Calendar.getInstance();
339+
calReferencePoint.setTime(currentDate);
340+
calReferencePoint.setTimeZone(tz);
341+
calReferencePoint.set(Calendar.YEAR, targetYear);
342+
343+
if (cal.getTimeInMillis() > calReferencePoint.getTimeInMillis() &&
344+
calMinusUpperBMonth.getTimeInMillis() > calReferencePoint.getTimeInMillis()) {
345+
//Date as is stands is in the future and also more than (upper bound) months in the future, therefore rolling back a year.
346+
//Need to roll back a year
347+
cal.add(Calendar.YEAR, -1);
348+
} else if (cal.getTimeInMillis() < calReferencePoint.getTimeInMillis() &&
349+
calPlusLowerBMonths.getTimeInMillis() < calReferencePoint.getTimeInMillis() ) {
350+
//Date as it stands is in the past and indeed more than (lower bound) months in the past
351+
//Need to roll forward a year
352+
cal.add(Calendar.YEAR, -1);
353+
}
354+
// Else it's in the middle and no modification required
355+
356+
return cal.getTime();
357+
358+
}
359+
283360
}
284361
}
285-
286-
}
362+
363+
}

0 commit comments

Comments
 (0)