1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.util;
28
29 import java.io.IOException;
30 import java.io.PrintWriter;
31 import java.io.StringWriter;
32 import java.text.NumberFormat;
33 import java.text.ParseException;
34 import java.text.SimpleDateFormat;
35 import java.util.Calendar;
36 import java.util.Date;
37 import java.util.GregorianCalendar;
38 import java.util.Locale;
39 import java.util.TimeZone;
40
41 /***
42 * Miscellaneous useful methods.
43 *
44 * @author gojomo & others
45 */
46 public class ArchiveUtils {
47
48 /***
49 * Arc-style date stamp in the format yyyyMMddHHmm and UTC time zone.
50 */
51 private static final ThreadLocal<SimpleDateFormat>
52 TIMESTAMP12 = threadLocalDateFormat("yyyyMMddHHmm");;
53
54 /***
55 * Arc-style date stamp in the format yyyyMMddHHmmss and UTC time zone.
56 */
57 private static final ThreadLocal<SimpleDateFormat>
58 TIMESTAMP14 = threadLocalDateFormat("yyyyMMddHHmmss");
59 /***
60 * Arc-style date stamp in the format yyyyMMddHHmmssSSS and UTC time zone.
61 */
62 private static final ThreadLocal<SimpleDateFormat>
63 TIMESTAMP17 = threadLocalDateFormat("yyyyMMddHHmmssSSS");
64
65 /***
66 * Log-style date stamp in the format yyyy-MM-dd'T'HH:mm:ss.SSS'Z'
67 * UTC time zone is assumed.
68 */
69 private static final ThreadLocal<SimpleDateFormat>
70 TIMESTAMP17ISO8601Z = threadLocalDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
71
72 /***
73 * Log-style date stamp in the format yyyy-MM-dd'T'HH:mm:ss'Z'
74 * UTC time zone is assumed.
75 */
76 private static final ThreadLocal<SimpleDateFormat>
77 TIMESTAMP14ISO8601Z = threadLocalDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
78
79 /***
80 * Default character to use padding strings.
81 */
82 private static final char DEFAULT_PAD_CHAR = ' ';
83
84 /*** milliseconds in an hour */
85 private static final int HOUR_IN_MS = 60 * 60 * 1000;
86 /*** milliseconds in a day */
87 private static final int DAY_IN_MS = 24 * HOUR_IN_MS;
88
89 private static ThreadLocal<SimpleDateFormat> threadLocalDateFormat(final String pattern) {
90 ThreadLocal<SimpleDateFormat> tl = new ThreadLocal<SimpleDateFormat>() {
91 protected SimpleDateFormat initialValue() {
92 SimpleDateFormat df = new SimpleDateFormat(pattern);
93 df.setTimeZone(TimeZone.getTimeZone("GMT"));
94 return df;
95 }
96 };
97 return tl;
98 }
99
100 public static int MAX_INT_CHAR_WIDTH =
101 Integer.toString(Integer.MAX_VALUE).length();
102
103 /***
104 * Utility function for creating arc-style date stamps
105 * in the format yyyMMddHHmmssSSS.
106 * Date stamps are in the UTC time zone
107 * @return the date stamp
108 */
109 public static String get17DigitDate(){
110 return TIMESTAMP17.get().format(new Date());
111 }
112
113 /***
114 * Utility function for creating arc-style date stamps
115 * in the format yyyMMddHHmmss.
116 * Date stamps are in the UTC time zone
117 * @return the date stamp
118 */
119 public static String get14DigitDate(){
120 return TIMESTAMP14.get().format(new Date());
121 }
122
123 /***
124 * Utility function for creating arc-style date stamps
125 * in the format yyyMMddHHmm.
126 * Date stamps are in the UTC time zone
127 * @return the date stamp
128 */
129 public static String get12DigitDate(){
130 return TIMESTAMP12.get().format(new Date());
131 }
132
133 /***
134 * Utility function for creating log timestamps, in
135 * W3C/ISO8601 format, assuming UTC. Use current time.
136 *
137 * Format is yyyy-MM-dd'T'HH:mm:ss.SSS'Z'
138 *
139 * @return the date stamp
140 */
141 public static String getLog17Date(){
142 return TIMESTAMP17ISO8601Z.get().format(new Date());
143 }
144
145 /***
146 * Utility function for creating log timestamps, in
147 * W3C/ISO8601 format, assuming UTC.
148 *
149 * Format is yyyy-MM-dd'T'HH:mm:ss.SSS'Z'
150 * @param date Date to format.
151 *
152 * @return the date stamp
153 */
154 public static String getLog17Date(long date){
155 return TIMESTAMP17ISO8601Z.get().format(new Date(date));
156 }
157
158 /***
159 * Utility function for creating log timestamps, in
160 * W3C/ISO8601 format, assuming UTC. Use current time.
161 *
162 * Format is yyyy-MM-dd'T'HH:mm:ss'Z'
163 *
164 * @return the date stamp
165 */
166 public static String getLog14Date(){
167 return TIMESTAMP14ISO8601Z.get().format(new Date());
168 }
169
170 /***
171 * Utility function for creating log timestamps, in
172 * W3C/ISO8601 format, assuming UTC.
173 *
174 * Format is yyyy-MM-dd'T'HH:mm:ss'Z'
175 * @param date long timestamp to format.
176 *
177 * @return the date stamp
178 */
179 public static String getLog14Date(long date){
180 return TIMESTAMP14ISO8601Z.get().format(new Date(date));
181 }
182
183 /***
184 * Utility function for creating log timestamps, in
185 * W3C/ISO8601 format, assuming UTC.
186 *
187 * Format is yyyy-MM-dd'T'HH:mm:ss'Z'
188 * @param date Date to format.
189 *
190 * @return the date stamp
191 */
192 public static String getLog14Date(Date date){
193 return TIMESTAMP14ISO8601Z.get().format(date);
194 }
195
196 /***
197 * Utility function for creating arc-style date stamps
198 * in the format yyyyMMddHHmmssSSS.
199 * Date stamps are in the UTC time zone
200 *
201 * @param date milliseconds since epoc
202 * @return the date stamp
203 */
204 public static String get17DigitDate(long date){
205 return TIMESTAMP17.get().format(new Date(date));
206 }
207
208 public static String get17DigitDate(Date date){
209 return TIMESTAMP17.get().format(date);
210 }
211
212 /***
213 * Utility function for creating arc-style date stamps
214 * in the format yyyyMMddHHmmss.
215 * Date stamps are in the UTC time zone
216 *
217 * @param date milliseconds since epoc
218 * @return the date stamp
219 */
220 public static String get14DigitDate(long date){
221 return TIMESTAMP14.get().format(new Date(date));
222 }
223
224 public static String get14DigitDate(Date d) {
225 return TIMESTAMP14.get().format(d);
226 }
227
228 /***
229 * Utility function for creating arc-style date stamps
230 * in the format yyyyMMddHHmm.
231 * Date stamps are in the UTC time zone
232 *
233 * @param date milliseconds since epoc
234 * @return the date stamp
235 */
236 public static String get12DigitDate(long date){
237 return TIMESTAMP12.get().format(new Date(date));
238 }
239
240 public static String get12DigitDate(Date d) {
241 return TIMESTAMP12.get().format(d);
242 }
243
244 /***
245 * Parses an ARC-style date. If passed String is < 12 characters in length,
246 * we pad. At a minimum, String should contain a year (>=4 characters).
247 * Parse will also fail if day or month are incompletely specified. Depends
248 * on the above getXXDigitDate methods.
249 * @param A 4-17 digit date in ARC style (<code>yyyy</code> to
250 * <code>yyyyMMddHHmmssSSS</code>) formatting.
251 * @return A Date object representing the passed String.
252 * @throws ParseException
253 */
254 public static Date getDate(String d) throws ParseException {
255 Date date = null;
256 if (d == null) {
257 throw new IllegalArgumentException("Passed date is null");
258 }
259 switch (d.length()) {
260 case 14:
261 date = ArchiveUtils.parse14DigitDate(d);
262 break;
263
264 case 17:
265 date = ArchiveUtils.parse17DigitDate(d);
266 break;
267
268 case 12:
269 date = ArchiveUtils.parse12DigitDate(d);
270 break;
271
272 case 0:
273 case 1:
274 case 2:
275 case 3:
276 throw new ParseException("Date string must at least contain a" +
277 "year: " + d, d.length());
278
279 default:
280 if (!(d.startsWith("19") || d.startsWith("20"))) {
281 throw new ParseException("Unrecognized century: " + d, 0);
282 }
283 if (d.length() < 8 && (d.length() % 2) != 0) {
284 throw new ParseException("Incomplete month/date: " + d,
285 d.length());
286 }
287 StringBuilder sb = new StringBuilder(d);
288 if (sb.length() < 8) {
289 for (int i = sb.length(); sb.length() < 8; i += 2) {
290 sb.append("01");
291 }
292 }
293 if (sb.length() < 12) {
294 for (int i = sb.length(); sb.length() < 12; i++) {
295 sb.append("0");
296 }
297 }
298 date = ArchiveUtils.parse12DigitDate(sb.toString());
299 }
300
301 return date;
302 }
303
304 /***
305 * Utility function for parsing arc-style date stamps
306 * in the format yyyMMddHHmmssSSS.
307 * Date stamps are in the UTC time zone. The whole string will not be
308 * parsed, only the first 17 digits.
309 *
310 * @param date an arc-style formatted date stamp
311 * @return the Date corresponding to the date stamp string
312 * @throws ParseException if the inputstring was malformed
313 */
314 public static Date parse17DigitDate(String date) throws ParseException {
315 return TIMESTAMP17.get().parse(date);
316 }
317
318 /***
319 * Utility function for parsing arc-style date stamps
320 * in the format yyyMMddHHmmss.
321 * Date stamps are in the UTC time zone. The whole string will not be
322 * parsed, only the first 14 digits.
323 *
324 * @param date an arc-style formatted date stamp
325 * @return the Date corresponding to the date stamp string
326 * @throws ParseException if the inputstring was malformed
327 */
328 public static Date parse14DigitDate(String date) throws ParseException{
329 return TIMESTAMP14.get().parse(date);
330 }
331
332 /***
333 * Utility function for parsing arc-style date stamps
334 * in the format yyyMMddHHmm.
335 * Date stamps are in the UTC time zone. The whole string will not be
336 * parsed, only the first 12 digits.
337 *
338 * @param date an arc-style formatted date stamp
339 * @return the Date corresponding to the date stamp string
340 * @throws ParseException if the inputstring was malformed
341 */
342 public static Date parse12DigitDate(String date) throws ParseException{
343 return TIMESTAMP12.get().parse(date);
344 }
345
346 /***
347 * Convert 17-digit date format timestamps (as found in crawl.log, for
348 * example) into a GregorianCalendar object. + * Useful so you can convert
349 * into milliseconds-since-epoch. Note: it is possible to compute
350 * milliseconds-since-epoch + * using {@link #parse17DigitDate}.UTC(), but
351 * that method is deprecated in favor of using Calendar.getTimeInMillis(). + *
352 * <p/>I probably should have dug into all the utility methods in
353 * DateFormat.java to parse the timestamp, but this was + * easier. If
354 * someone wants to fix this to use those methods, please have at it! <p/>
355 * Mike Schwartz, schwartz at CodeOnTheRoad dot com.
356 *
357 * @param timestamp17String
358 * @return Calendar set to <code>timestamp17String</code>.
359 */
360 public static Calendar timestamp17ToCalendar(String timestamp17String) {
361 GregorianCalendar calendar = new GregorianCalendar();
362 int year = Integer.parseInt(timestamp17String.substring(0, 4));
363 int dayOfMonth = Integer.parseInt(timestamp17String.substring(6, 8));
364
365 int month = Integer.parseInt(timestamp17String.substring(4, 6)) - 1;
366 int hourOfDay = Integer.parseInt(timestamp17String.substring(8, 10));
367 int minute = Integer.parseInt(timestamp17String.substring(10, 12));
368 int second = Integer.parseInt(timestamp17String.substring(12, 14));
369 int milliseconds = Integer
370 .parseInt(timestamp17String.substring(14, 17));
371 calendar.set(Calendar.YEAR, year);
372 calendar.set(Calendar.MONTH, month);
373 calendar.set(Calendar.DAY_OF_MONTH, dayOfMonth);
374 calendar.set(Calendar.HOUR_OF_DAY, hourOfDay);
375 calendar.set(Calendar.MINUTE, minute);
376 calendar.set(Calendar.SECOND, second);
377 calendar.set(Calendar.MILLISECOND, milliseconds);
378 return calendar;
379 }
380
381 /***
382 * @param timestamp A 14-digit timestamp or the suffix for a 14-digit
383 * timestamp: E.g. '20010909014640' or '20010101' or '1970'.
384 * @return Seconds since the epoch as a string zero-pre-padded so always
385 * Integer.MAX_VALUE wide (Makes it so sorting of resultant string works
386 * properly).
387 * @throws ParseException
388 */
389 public static String secondsSinceEpoch(String timestamp)
390 throws ParseException {
391 return zeroPadInteger((int)
392 (getSecondsSinceEpoch(timestamp).getTime()/1000));
393 }
394
395 /***
396 * @param timestamp A 14-digit timestamp or the suffix for a 14-digit
397 * timestamp: E.g. '20010909014640' or '20010101' or '1970'.
398 * @return A date.
399 * @see #secondsSinceEpoch(String)
400 * @throws ParseException
401 */
402 public static Date getSecondsSinceEpoch(String timestamp)
403 throws ParseException {
404 if (timestamp.length() < 14) {
405 if (timestamp.length() < 10 && (timestamp.length() % 2) == 1) {
406 throw new IllegalArgumentException("Must have year, " +
407 "month, date, hour or second granularity: " + timestamp);
408 }
409 if (timestamp.length() == 4) {
410
411 timestamp = timestamp + "01010000";
412 }
413 if (timestamp.length() == 6) {
414
415 timestamp = timestamp + "010000";
416 }
417 if (timestamp.length() < 14) {
418 timestamp = timestamp +
419 ArchiveUtils.padTo("", 14 - timestamp.length(), '0');
420 }
421 }
422 return ArchiveUtils.parse14DigitDate(timestamp);
423 }
424
425 /***
426 * @param i Integer to add prefix of zeros too. If passed
427 * 2005, will return the String <code>0000002005</code>. String
428 * width is the width of Integer.MAX_VALUE as a string (10
429 * digits).
430 * @return Padded String version of <code>i</code>.
431 */
432 public static String zeroPadInteger(int i) {
433 return ArchiveUtils.padTo(Integer.toString(i),
434 MAX_INT_CHAR_WIDTH, '0');
435 }
436
437 /***
438 * Convert an <code>int</code> to a <code>String</code>, and pad it to
439 * <code>pad</code> spaces.
440 * @param i the int
441 * @param pad the width to pad to.
442 * @return String w/ padding.
443 */
444 public static String padTo(final int i, final int pad) {
445 String n = Integer.toString(i);
446 return padTo(n, pad);
447 }
448
449 /***
450 * Pad the given <code>String</code> to <code>pad</code> characters wide
451 * by pre-pending spaces. <code>s</code> should not be <code>null</code>.
452 * If <code>s</code> is already wider than <code>pad</code> no change is
453 * done.
454 *
455 * @param s the String to pad
456 * @param pad the width to pad to.
457 * @return String w/ padding.
458 */
459 public static String padTo(final String s, final int pad) {
460 return padTo(s, pad, DEFAULT_PAD_CHAR);
461 }
462
463 /***
464 * Pad the given <code>String</code> to <code>pad</code> characters wide
465 * by pre-pending <code>padChar</code>.
466 *
467 * <code>s</code> should not be <code>null</code>. If <code>s</code> is
468 * already wider than <code>pad</code> no change is done.
469 *
470 * @param s the String to pad
471 * @param pad the width to pad to.
472 * @param padChar The pad character to use.
473 * @return String w/ padding.
474 */
475 public static String padTo(final String s, final int pad,
476 final char padChar) {
477 String result = s;
478 int l = s.length();
479 if (l < pad) {
480 StringBuffer sb = new StringBuffer(pad);
481 while(l < pad) {
482 sb.append(padChar);
483 l++;
484 }
485 sb.append(s);
486 result = sb.toString();
487 }
488 return result;
489 }
490
491 /*** check that two byte arrays are equal. They may be <code>null</code>.
492 *
493 * @param lhs a byte array
494 * @param rhs another byte array.
495 * @return <code>true</code> if they are both equal (or both
496 * <code>null</code>)
497 */
498 public static boolean byteArrayEquals(final byte[] lhs, final byte[] rhs) {
499 if (lhs == null && rhs != null || lhs != null && rhs == null) {
500 return false;
501 }
502 if (lhs==rhs) {
503 return true;
504 }
505 if (lhs.length != rhs.length) {
506 return false;
507 }
508 for(int i = 0; i<lhs.length; i++) {
509 if (lhs[i]!=rhs[i]) {
510 return false;
511 }
512 }
513 return true;
514 }
515
516 /***
517 * Converts a double to a string.
518 * @param val The double to convert
519 * @param precision How many characters to include after '.'
520 * @return the double as a string.
521 */
522 public static String doubleToString(double val, int maxFractionDigits){
523 return doubleToString(val, maxFractionDigits, 0);
524 }
525
526 private static String doubleToString(double val, int maxFractionDigits, int minFractionDigits) {
527 NumberFormat f = NumberFormat.getNumberInstance(Locale.US);
528 f.setMaximumFractionDigits(maxFractionDigits);
529 f.setMinimumFractionDigits(minFractionDigits);
530 return f.format(val);
531 }
532
533 /***
534 * Takes a byte size and formats it for display with 'friendly' units.
535 * <p>
536 * This involves converting it to the largest unit
537 * (of B, KB, MB, GB, TB) for which the amount will be > 1.
538 * <p>
539 * Additionally, at least 2 significant digits are always displayed.
540 * <p>
541 * Displays as bytes (B): 0-1023
542 * Displays as kilobytes (KB): 1024 - 2097151 (~2Mb)
543 * Displays as megabytes (MB): 2097152 - 4294967295 (~4Gb)
544 * Displays as gigabytes (GB): 4294967296 - infinity
545 * <p>
546 * Negative numbers will be returned as '0 B'.
547 *
548 * @param amount the amount of bytes
549 * @return A string containing the amount, properly formated.
550 */
551 public static String formatBytesForDisplay(long amount) {
552 double displayAmount = (double) amount;
553 int unitPowerOf1024 = 0;
554
555 if(amount <= 0){
556 return "0 B";
557 }
558
559 while(displayAmount>=1024 && unitPowerOf1024 < 4) {
560 displayAmount = displayAmount / 1024;
561 unitPowerOf1024++;
562 }
563
564
565 final String[] units = { " B", " KB", " MB", " GB", " TB" };
566
567
568 int fractionDigits = (displayAmount < 10) ? 1 : 0;
569 return doubleToString(displayAmount, fractionDigits, fractionDigits)
570 + units[unitPowerOf1024];
571 }
572
573 /***
574 * Convert milliseconds value to a human-readable duration
575 * @param time
576 * @return Human readable string version of passed <code>time</code>
577 */
578 public static String formatMillisecondsToConventional(long time) {
579 return formatMillisecondsToConventional(time,true);
580 }
581
582 /***
583 * Convert milliseconds value to a human-readable duration
584 * @param time
585 * @param toMs whether to print to the ms
586 * @return Human readable string version of passed <code>time</code>
587 */
588 public static String formatMillisecondsToConventional(long time, boolean toMs) {
589 StringBuffer sb = new StringBuffer();
590 if(time<0) {
591 sb.append("-");
592 }
593 long absTime = Math.abs(time);
594 if(!toMs && absTime < 1000) {
595 return "0s";
596 }
597 if(absTime > DAY_IN_MS) {
598
599 sb.append(absTime / DAY_IN_MS + "d");
600 absTime = absTime % DAY_IN_MS;
601 }
602 if (absTime > HOUR_IN_MS) {
603
604 sb.append(absTime / HOUR_IN_MS + "h");
605 absTime = absTime % HOUR_IN_MS;
606 }
607 if (absTime > 60000) {
608 sb.append(absTime / 60000 + "m");
609 absTime = absTime % 60000;
610 }
611 if (absTime > 1000) {
612 sb.append(absTime / 1000 + "s");
613 absTime = absTime % 1000;
614 }
615 if(toMs) {
616 sb.append(absTime + "ms");
617 }
618 return sb.toString();
619 }
620
621
622 /***
623 * Generate a long UID based on the given class and version number.
624 * Using this instead of the default will assume serialization
625 * compatibility across class changes unless version number is
626 * intentionally bumped.
627 *
628 * @param class1
629 * @param version
630 * @return UID based off class and version number.
631 */
632 public static long classnameBasedUID(Class class1, int version) {
633 String callingClassname = class1.getName();
634 return (long)callingClassname.hashCode() << 32 + version;
635 }
636
637 /***
638 * Copy the raw bytes of a long into a byte array, starting at
639 * the specified offset.
640 *
641 * @param l
642 * @param array
643 * @param offset
644 */
645 public static void longIntoByteArray(long l, byte[] array, int offset) {
646 int i, shift;
647
648 for(i = 0, shift = 56; i < 8; i++, shift -= 8)
649 array[offset+i] = (byte)(0xFF & (l >> shift));
650 }
651
652 public static long byteArrayIntoLong(byte [] bytearray) {
653 return byteArrayIntoLong(bytearray, 0);
654 }
655
656 /***
657 * Byte array into long.
658 * @param bytearray Array to convert to a long.
659 * @param offset Offset into array at which we start decoding the long.
660 * @return Long made of the bytes of <code>array</code> beginning at
661 * offset <code>offset</code>.
662 * @see #longIntoByteArray(long, byte[], int)
663 */
664 public static long byteArrayIntoLong(byte [] bytearray,
665 int offset) {
666 long result = 0;
667 for (int i = offset; i < 8
668 result = (result << 8
669 (0xff & (byte)(bytearray[i] & 0xff));
670 }
671 return result;
672 }
673
674 /***
675 * Given a string that may be a plain host or host/path (without
676 * URI scheme), add an implied http:// if necessary.
677 *
678 * @param u string to evaluate
679 * @return string with http:// added if no scheme already present
680 */
681 public static String addImpliedHttpIfNecessary(String u) {
682 if(u.indexOf(':') == -1 || u.indexOf('.') < u.indexOf(':')) {
683
684 u = "http://" + u;
685 }
686 return u;
687 }
688
689 /***
690 * Verify that the array begins with the prefix.
691 *
692 * @param array
693 * @param prefix
694 * @return true if array is identical to prefix for the first prefix.length
695 * positions
696 */
697 public static boolean startsWith(byte[] array, byte[] prefix) {
698 if(prefix.length>array.length) {
699 return false;
700 }
701 for(int i = 0; i < prefix.length; i++) {
702 if(array[i]!=prefix[i]) {
703 return false;
704 }
705 }
706 return true;
707 }
708
709 /***
710 * Utility method to get a String singleLineReport from Reporter
711 * @param rep Reporter to get singleLineReport from
712 * @return String of report
713 */
714 public static String singleLineReport(Reporter rep) {
715 StringWriter sw = new StringWriter();
716 PrintWriter pw = new PrintWriter(sw);
717 try {
718 rep.singleLineReportTo(pw);
719 } catch (IOException e) {
720
721 e.printStackTrace();
722 }
723 pw.flush();
724 return sw.toString();
725 }
726
727 /***
728 * Compose the requested report into a String. DANGEROUS IF REPORT
729 * CAN BE LARGE.
730 *
731 * @param rep Reported
732 * @param name String name of report to compose
733 * @return String of report
734 */
735 public static String writeReportToString(Reporter rep, String name) {
736 StringWriter sw = new StringWriter();
737 PrintWriter pw = new PrintWriter(sw);
738 rep.reportTo(name,pw);
739 pw.flush();
740 return sw.toString();
741 }
742 }
743