001package votorola.a.diff.harvest; // Copyright 2012, Christian Weilbach. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE. 002 003import java.io.BufferedReader; 004import java.io.IOException; 005import java.io.InputStream; 006import java.io.InputStreamReader; 007import java.io.PrintStream; 008import java.io.UnsupportedEncodingException; 009 010import java.sql.SQLException; 011import java.text.DateFormatSymbols; 012import java.text.ParseException; 013import java.text.SimpleDateFormat; 014import java.util.Arrays; 015import java.util.Collections; 016import java.util.Date; 017import java.util.HashMap; 018import java.util.LinkedList; 019import java.util.List; 020import java.util.Locale; 021import java.util.Map; 022import java.util.TimeZone; 023import java.util.concurrent.atomic.AtomicBoolean; 024import java.util.logging.Level; 025import java.util.logging.Logger; 026import java.util.regex.Matcher; 027import java.util.regex.Pattern; 028 029import org.apache.commons.lang3.StringEscapeUtils; 030 031import com.ibm.icu.util.Calendar; 032import com.ibm.icu.util.GregorianCalendar; 033 034import votorola.a.diff.harvest.auth.BasicAuthenticator; 035import votorola.a.diff.harvest.cache.HarvestCache; 036import votorola.a.diff.harvest.kick.Kick; 037import votorola.a.diff.harvest.kick.KickReceiver; 038import votorola.a.diff.harvest.kick.Kicker; 039import votorola.a.diff.harvest.kick.UpdateKick; 040import votorola.a.diff.harvest.run.AbstractFetcher; 041import votorola.a.diff.harvest.run.Fetcher; 042import votorola.a.diff.harvest.run.HarvestRunner; 043import votorola.g.lang.ThreadSafe; 044import votorola.g.lang.Warning; 045import votorola.g.logging.LoggerX; 046 047/** 048 * <p> 049 * A harvester implementation for pipermail archives. Pipermail archives are 050 * generated by <a href="http://en.wikipedia.org/wiki/GNU_Mailman">mailman</a>. 051 * Harvested messages are stored in {@linkplain HarvestCache} and fetchers are 052 * scheduled with {@linkplain HarvestRunner}. 053 * </p> 054 * 055 * <p> 056 * Basically we do a <a 057 * href="http://en.wikipedia.org/wiki/Depth-first_search">depth first search</a> 058 * sorted by date. Web-view on the remote archive for this harvester: 059 * </p> 060 * <br/> 061 * 062 * <pre *> 063 * <b>Tree of linked pages. Level Example remote archive URLs</b> 064 * r1 {@linkplain RootFetcher root} <i>http://mail.zelea.com/list/votorola/</i> 065 * / \ 066 * / \ 067 * / \ 068 * / \ 069 * i2 i3 {@linkplain PeriodFetcher index} <i>2010-Jan/date.html</i> 070 * / \ /|\ 071 * / \ / | \ 072 * m4 m5 m6 m7 m8 {@linkplain MessageFetcher message} <i>2010-Jan/003321.html</i> 073 * </pre> 074 * 075 * <p> 076 * All fetchers depend on <b>one</b> HTTP request. Runtime steps for these 077 * {@linkplain AbstractFetcher Fetchfetchers}: 078 * <ol> 079 * <li>Create fetcher and {@linkplain HarvestRunner#scheduleLast(Fetcher) 080 * schedule} it</li> 081 * <li>Once {@linkplain HarvestRunner} handles the fetcher, it asynchronously 082 * fetches the HTTP remote page and runs the fetcher in its thread pool</li> 083 * <li>Having access to {@linkplain AbstractFetcher#getInputStream()} now, the 084 * fetcher parses the page</li> 085 * <li>r1, i2, i3 start fetchers for the index of URLs parsed out of the page</li> 086 * </ol> 087 * </p> 088 * <p> 089 * Result after r1: <b>i2-i3</b>, after i2: <b>m4-m5-i3</b> after i3: 090 * <b>m6-m7-m8</b>. <br/> 091 * The state is save by anonymously extending the last Messagefetcher like m5 or 092 * m8 appropriately. 093 * </p> 094 * <p> 095 * <b>Note:</b> This is only an example. You can submit your 096 * {@linkplain votorola.a.diff.harvest.run.Fetcher} differently and you can also 097 * save state differently. Usage of the {@linkplain HarvestRunner} is 098 * recommended for graceful I/O handling though. 099 * </p> 100 * 101 * @see votorola.a.diff.harvest The communication diagram for details of the 102 * overall harvesting concept. 103 */ 104public class PipermailHarvester { 105 106 private final static Logger LOGGER = LoggerX.i(PipermailHarvester.class); 107 108 // / / / Harvesting services 109 110 private final HarvestCache hCache; 111 112 /** 113 * Since PipermailHarvester.PAT_AUTHOR allows us to parse the E-Mail out of 114 * the page, we can generate the MailishUsername directly and therefore use 115 * the verifier which just compares mailish usernames of the auhors of the 116 * difference with the author of the message. 117 */ 118 private final BasicAuthenticator aVerifier; 119 120 /** 121 * Global singleton scheduler. 122 */ 123 private final HarvestRunner runner = HarvestRunner.i(); 124 125 // / / / Forum specific settings 126 127 // list-info page 128 129 /** 130 * Pattern for list-info page. (for setup) 131 */ 132 public final static Pattern PAT_LISTINFO = Pattern 133 .compile("<a href=\"(.*)\">"); 134 135 // root page 136 137 /** 138 * Find input encoding. 139 */ 140 public final static Pattern PAT_INPUTENC = Pattern 141 .compile("text/html; charset=([a-z0-9\\-]+)"); 142 143 /** 144 * Parse language. 145 */ 146 public final static Pattern PAT_LANG = Pattern 147 .compile("<INPUT name=\"language\" type=\"HIDDEN\" value=\"(\\w+)\" >"); 148 149 /** 150 * Pattern to parse sub-list of posts for each month in 151 * {@linkplain RootFetcher}. If period is in years: 2012/date.html ... in 152 * months: 2012-October/date.html TODO add/fix weekly pattern if used 153 * somewhere 154 */ 155 public final static Pattern PAT_PERIOD = Pattern 156 .compile("=\"((\\d+)-?(\\w+)?)/date.html\"\\>\\[ \\S+ \\]"); 157 /** 158 * Pattern to parse list of posts in this month list in 159 * {@linkplain PeriodFetcher}. 160 */ 161 public final static Pattern PAT_POST = Pattern 162 .compile("=\"(\\S+\\.html)\"\\>"); 163 164 /** 165 * Pattern to find author email in {@linkplain MessageFetcher}. 166 */ 167 public final static Pattern PAT_AUTHOR = Pattern.compile("HREF=\"mailto"); 168 /** 169 * Pattern to parse author email in {@linkplain MessageFetcher}. 170 */ 171 public final static Pattern PAT_AUTHOR2 = Pattern.compile("TITLE.+\\>(.+)"); 172 /** 173 * Pattern to parse sent date in {@linkplain MessageFetcher}. 174 */ 175 public final static Pattern PAT_SENTDATE = Pattern 176 .compile("<I>(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)</I>"); 177 // example with grouping: (Di )(Okt )(30 )(23:48:50 )(CET )(2012) 178 // or: (Jeu )(1 )(Nov )(12:00:20 )(UTC )(2012) 179 // needed to always parse English timezone in Messagefetcher 180 /** 181 * Pattern to find the start of the body of the message in 182 * {@linkplain MessageFetcher}. 183 */ 184 public final static Pattern PAT_CONTENT_START = Pattern 185 .compile("\\<PRE\\>"); 186 /** 187 * Pattern to find the end of the body of the message in 188 * {@linkplain MessageFetcher}. 189 */ 190 public final static Pattern PAT_CONTENT_END = Pattern.compile("\\</PRE\\>"); 191 192 private final StateTable stateTable; 193 194 /** 195 * Structure to save state of a single archive. <b>declared package private 196 * for example documentation</b> 197 */ 198 @Warning("non-API") 199 @ThreadSafe 200 class Archive { 201 202 class SetupFetcher extends AbstractFetcher { 203 final Archive archive; 204 205 public SetupFetcher(final Archive archive) { 206 super(archive.archiveUrl(), ""); 207 this.archive = archive; 208 } 209 210 @Override 211 public void run() { 212 BufferedReader in; 213 String tmpListInfo = ""; 214 String enc = ""; 215 try { 216 // expect the first fetch to work for html parsing only with 217 // iso-8859-1 encoding (?) 218 in = new BufferedReader(new InputStreamReader( 219 getInputStream(), "iso-8859-1")); 220 221 for (String line = in.readLine(); line != null; line = in 222 .readLine()) { 223 Matcher m; 224 m = PAT_INPUTENC.matcher(line); 225 if (m.find()) { 226 enc = m.group(1); 227 archive.setInputEncoding(enc); 228 } 229 230 m = PAT_LISTINFO.matcher(line); 231 if (m.find()) { 232 tmpListInfo = m.group(1); 233 } 234 } 235 } catch (UnsupportedEncodingException e) { 236 final String msg = "Bug: Could not parse list-info HTML with iso-8859-1."; 237 LOGGER.severe(msg); 238 archive.setFaulty(msg); 239 } catch (IOException e) { 240 final String msg = "Bug: Could not get list-info HTML."; 241 LOGGER.severe(msg); 242 archive.setFaulty(msg); 243 } 244 245 if (enc.isEmpty() || tmpListInfo.isEmpty()) { 246 final String msg = "Could parse setup for archive:" 247 + (enc.isEmpty() ? " encoding not found" : "") 248 + (tmpListInfo.isEmpty() ? ", list-info page not found" 249 : ""); 250 LOGGER.severe(msg); 251 archive.setFaulty(msg); 252 return; 253 } 254 archive.setInputEncoding(enc); 255 final String listInfo = tmpListInfo; 256 257 runner.scheduleFirst(new Fetcher() { 258 259 @Override 260 public void run() { 261 final String enc = archive.inputEncoding(); 262 263 final String langCode = indexPage(inputStream, 264 PAT_LANG, enc).get(0); 265 final Locale locale = new Locale(langCode); 266 267 archive.setLocale(locale); 268 } 269 270 @Override 271 public void fault(final String msg) { 272 archive.setFaulty(msg); 273 } 274 275 // / / Fetcher 276 277 @Override 278 public String archiveUrl() { 279 return archiveUrl; 280 } 281 282 @Override 283 public String url() { 284 return listInfo; 285 } 286 287 private InputStream inputStream; 288 289 @Override 290 public void setInputStream(InputStream in) { 291 this.inputStream = in; 292 } 293 294 @Override 295 public void setStatusCode(int code) { 296 if (code >= 400) { 297 final String error = "Failed to fetch list-info page: " 298 + listInfo; 299 LOGGER.warning(error); 300 archive.setFaulty(error); 301 } 302 } 303 304 }); 305 } 306 307 @Override 308 public void fault(final String msg) { 309 archive.setFaulty(msg); 310 } 311 } 312 313 private final String archiveUrl; 314 315 public Archive(final String archiveUrl) { 316 this.archiveUrl = archiveUrl; 317 runner.scheduleFirst(new SetupFetcher(this)); 318 } 319 320 public synchronized String archiveUrl() { 321 return archiveUrl; 322 } 323 324 private volatile boolean isFaulty = false; 325 326 public synchronized boolean isFaulty() { 327 return isFaulty; 328 } 329 330 private volatile String faultMsg = ""; 331 332 public synchronized String fault() { 333 return faultMsg; 334 } 335 336 public synchronized void setFaulty(final String msg) { 337 this.faultMsg = msg; 338 isFaulty = true; 339 } 340 341 private volatile Locale locale = null; 342 343 public synchronized void setLocale(final Locale locale) { 344 if (locale == null) { 345 final String msg = "Could not parse locale, is null."; 346 LOGGER.severe(msg); 347 setFaulty(msg); 348 return; 349 } 350 this.locale = locale; 351 } 352 353 private volatile String inputEncoding = ""; 354 355 public synchronized void setInputEncoding(final String encoding) { 356 if (encoding.isEmpty()) { 357 final String msg = "Parsed encoding is empty."; 358 LOGGER.severe(msg); 359 setFaulty(msg); 360 return; 361 } 362 inputEncoding = encoding; 363 } 364 365 public synchronized String inputEncoding() { 366 return inputEncoding; 367 } 368 369 /** 370 * Whether the archive has been updated in this instance yet. 371 */ 372 public final AtomicBoolean firstRun = new AtomicBoolean(true); 373 374 private volatile Date nextPeriodStart = new Date(0); 375 376 private volatile String currentPeriod = ""; 377 378 /** 379 * This determines whether UpdateContext will lookup the current month 380 * directly or do an index update if the month has changed. 381 * 382 * @param current 383 */ 384 public synchronized void setCurrentPeriod(String current) { 385 this.currentPeriod = current; 386 387 String format = "yyyy"; 388 final Matcher m = PAT_PERIOD.matcher(current); 389 boolean monthly = false; 390 if (m.find()) { 391 final String month = m.group(3); 392 if (!month.isEmpty()) { 393 monthly = true; 394 format += "-MMMM"; 395 } 396 } 397 398 SimpleDateFormat sdf = new SimpleDateFormat(format, Locale.ENGLISH); 399 GregorianCalendar cal = new GregorianCalendar(); 400 try { 401 cal.setTime(sdf.parse(current)); 402 } catch (ParseException e) { 403 LOGGER.log(Level.WARNING, "Cannot parse date from: " + current, 404 e); 405 } 406 cal.roll(monthly ? Calendar.MONTH : Calendar.YEAR, true); 407 this.nextPeriodStart = cal.getTime(); 408 } 409 410 public synchronized Date nextPeriodStart() { 411 return nextPeriodStart; 412 } 413 414 public synchronized String currentPeriod() { 415 return currentPeriod; 416 } 417 418 public synchronized Locale locale() { 419 return locale; 420 } 421 422 } 423 424 /** 425 * Tracks all configured archives. Key is the base URL of the archive. 426 * <b>declared package private for example documentation</b> 427 */ 428 final Map<String, Archive> archives = Collections 429 .synchronizedMap(new HashMap<String, Archive>()); 430 431 /** 432 * Receives {@linkplain Kick kicks} from {@linkplain Kicker} for this 433 * Harvester. <b>declared package private for example documentation</b> 434 */ 435 @Warning("non-API") 436 class MyKickReceiver implements KickReceiver { 437 438 /** 439 * Handles kick events from {@linkplain Kicker} 440 * 441 * @param kick 442 * some {@linkplain UpdateKick} 443 */ 444 public void handle(Kick kick) { 445 446 if (kick instanceof UpdateKick) { 447 final UpdateKick updateKick = (UpdateKick) kick; 448 449 if (!updateKick.archiveDesign().contains("Pipermail")) { 450 return; 451 } 452 453 final String archiveUrl = updateKick.archiveUrl(); 454 455 if (!archives.containsKey(archiveUrl)) { 456 archives.put(archiveUrl, new Archive(archiveUrl)); 457 } 458 459 UpdateContext context = new UpdateContext(updateKick); 460 context.run(); 461 } 462 } 463 } 464 465 /** 466 * Construct a new harvester for pipermail. 467 */ 468 public PipermailHarvester() { 469 hCache = HarvestCache.i(); 470 Kicker.i().register(new MyKickReceiver()); 471 aVerifier = new BasicAuthenticator(); 472 stateTable = new StateTable(hCache.getDatabase()); 473 try { 474 if (!stateTable.exists()) { 475 stateTable.create(); 476 } 477 } catch (SQLException e) { 478 LOGGER.log(Level.SEVERE, "State table initialization error.", e); 479 System.exit(1); 480 } 481 } 482 483 /** 484 * Provides update context. 485 */ 486 @Warning("non-API") 487 abstract class ContextFetcher extends AbstractFetcher { 488 protected final UpdateContext context; 489 490 public ContextFetcher(final UpdateContext context, final String path) { 491 super(context.archiveUrl(), path); 492 this.context = context; 493 } 494 495 @Override 496 public void fault(final String msg) { 497 context.archive.setFaulty(msg); 498 context.next(); 499 } 500 501 } 502 503 /** 504 * Context to run fetchers and synchronize state during an update. 505 * 506 */ 507 @Warning("non-API") 508 @ThreadSafe 509 class UpdateContext implements Runnable { 510 511 private final AtomicBoolean isStarted = new AtomicBoolean(false); 512 513 private final List<AbstractFetcher> queue = Collections 514 .synchronizedList(new LinkedList<AbstractFetcher>()); 515 516 public final Marker endMarker; 517 518 private volatile Marker startMarker; 519 520 private final Archive archive; 521 522 // kick objects 523 private final String archiveUrl; 524 private final PrintStream reportStream; 525 private final HarvestReporter reporter; 526 527 public UpdateContext(final UpdateKick kick) { 528 this.archiveUrl = kick.archiveUrl(); 529 this.archive = archives.get(archiveUrl); 530 this.reporter = kick.reporter(); 531 this.reportStream = kick.reporter().printStream(); 532 533 Marker tempEndMarker = null; 534 try { 535 // ignore possibly stale temporary entries 536 if (archive.firstRun.get()) { 537 tempEndMarker = stateTable.getNewest(archiveUrl, 538 StateTable.Type.PERM); 539 } else { 540 tempEndMarker = stateTable.getNewest(archiveUrl); 541 } 542 } catch (SQLException e) { 543 LOGGER.log( 544 Level.SEVERE, 545 "Cannot access state storage, further processing is useless.", 546 e); 547 System.exit(1); 548 } finally { 549 endMarker = tempEndMarker; 550 } 551 } 552 553 public synchronized String archiveUrl() { 554 return archiveUrl; 555 } 556 557 public synchronized void startUpdate(final Marker m) 558 throws SQLException { 559 if (isStarted.get()) { // already running. 560 return; 561 } 562 563 final String message = "Starting update to " 564 + (endMarker.path().isEmpty() ? "beginning of archive " 565 : endMarker.path()); 566 LOGGER.fine(message); 567 reportStream.println(message); 568 569 // other fetcher has grabbed start url already 570 if (!stateTable.put(archiveUrl, m) && !archive.firstRun.get()) { 571 finished(); 572 } 573 574 startMarker = m; 575 isStarted.set(true); 576 archive.firstRun.set(false); 577 } 578 579 public synchronized void run() { 580 AbstractFetcher fetcher; 581 if (new Date().after(archive.nextPeriodStart())) { 582 fetcher = new RootFetcher(this); 583 } else { 584 fetcher = new PeriodFetcher(archive.currentPeriod(), this); 585 } 586 runner.scheduleLast(fetcher); 587 } 588 589 public synchronized void failure(final String message) { 590 reportStream.println("Archive: " + archiveUrl + " failed with: " 591 + message); 592 queue.clear(); 593 reporter.proccessFinished(); 594 } 595 596 public synchronized void next() { 597 if (archive.isFaulty()) { 598 failure(archive.fault()); 599 return; 600 } 601 602 if (!queue.isEmpty()) { 603 Fetcher fetcher = queue.remove(0); 604 if (!isStarted.get()) { 605 reportStream.println("entering archive: " + fetcher.url()); 606 } else { 607 reportStream.println("harvesting: " + fetcher.url()); 608 } 609 runner.scheduleLast(fetcher); 610 } else { 611 finished(); 612 } 613 } 614 615 public synchronized void finished() { 616 if (isStarted.get()) { 617 LOGGER.fine("Finishing update: " + endMarker.path()); 618 try { 619 stateTable.finish(archiveUrl, startMarker, endMarker); 620 stateTable.update(archiveUrl); 621 } catch (SQLException e) { 622 LOGGER.log( 623 Level.SEVERE, 624 "Cannot save state, further processing will be useless.", 625 e); 626 System.exit(1); 627 } 628 } 629 queue.clear(); // allow garbage collection 630 reporter.proccessFinished(); 631 } 632 633 public synchronized Archive archive() { 634 return archive; 635 } 636 637 } 638 639 /** 640 * An index fetcher for the whole pipermail archive, like 641 * "http://mail.zelea.com/list/votorola/". Spawns new 642 * {@linkplain PeriodFetcher month fetchers}. <b>declared package private 643 * for example documentation</b> 644 * 645 * @see PeriodFetcher 646 */ 647 @Warning("non-API") 648 class RootFetcher extends ContextFetcher { 649 650 /** 651 * Carries the context along. 652 * 653 * @param context 654 */ 655 public RootFetcher(final UpdateContext context) { 656 super(context, ""); 657 } 658 659 public void run() { 660 // wait for encoding and locale to be parsed 661 // this only happens on the first run 662 if(context.archive.locale()==null 663 || context.archive.inputEncoding().isEmpty()) { 664 runner.scheduleLast(this); 665 return; 666 } 667 668 try { 669 // newest first 670 List<String> periods = indexPage(getInputStream(), PAT_PERIOD, 671 context.archive().inputEncoding()); 672 673 final String thisMonth = periods.get(0); 674 archives.get(archiveUrl()).setCurrentPeriod(thisMonth); 675 676 // newest last 677 Collections.reverse(periods); 678 for (final String period : periods) { 679 context.queue.add(0, new PeriodFetcher(period, context)); 680 } 681 } catch (Exception e) { 682 final String msg = "Could not fetch root element: " 683 + e.getMessage(); 684 LOGGER.log(Level.WARNING, msg, e); 685 context.archive.setFaulty(msg); 686 } 687 context.next(); 688 } 689 } 690 691 /** 692 * An index fetcher for one period like "2010-January/date.html" or 693 * "2012/date.html". Spawns new {@linkplain MessageFetcher fetchers} 694 * Fetching the "date.html" index allows us to schedule page fetches by 695 * PipermailHarvester.PAT_SENTDATE. * <b>declared package private for 696 * example documentation</b> 697 * 698 * @see RootFetcher 699 * @see MessageFetcher 700 */ 701 @Warning("non-API") 702 class PeriodFetcher extends ContextFetcher { 703 704 public PeriodFetcher(final String period, final UpdateContext context) { 705 super(context, period + "/date.html"); 706 this.period = period; 707 } 708 709 private final String period; 710 711 public void run() { 712 try { 713 // newest last 714 List<String> posts = indexPage(getInputStream(), PAT_POST, 715 context.archive().inputEncoding()); 716 // newest first 717 Collections.reverse(posts); 718 LinkedList<String> cleanList = new LinkedList<String>(); 719 720 synchronized (context) { 721 for (String post : posts) { 722 if (context.endMarker.path() 723 .equals(period + "/" + post)) { 724 // finished when we hit the marker 725 context.queue.clear(); // remove earlier months 726 break; 727 } 728 cleanList.add(post); 729 } 730 731 // newest last 732 Collections.reverse(cleanList); 733 for (String post : cleanList) { 734 context.queue.add(0, new MessageFetcher(context, period 735 + "/" + post)); 736 } 737 } 738 context.next(); 739 } catch (Exception e) { 740 final String msg = "Could not fetch period " + period + ": " 741 + e.getMessage(); 742 LOGGER.log(Level.WARNING, msg, e); 743 context.archive().setFaulty(msg); 744 } 745 } 746 } 747 748 /** 749 * A fetcher for a single page, like "2010-Jan/003882.html". Contains only 750 * one message in pipermail. <b>declared package private for example 751 * documentation</b> 752 * 753 * @see PeriodFetcher 754 */ 755 @Warning("non-API") 756 class MessageFetcher extends ContextFetcher { 757 758 /** 759 * Timezone is parsed in English separately, 760 * 761 * @see run() 762 */ 763 private final SimpleDateFormat dateFormat; 764 765 public MessageFetcher(final UpdateContext context, final String path) { 766 super(context, path); 767 final Locale locale = context.archive.locale(); 768 if (locale == null) { 769 context.failure("Locale is null"); 770 dateFormat = null; 771 return; 772 } 773 774 dateFormat = mailmanDateFormat(locale); 775 if (dateFormat == null) { 776 context.failure("Locale not supported: " + locale.getCountry()); 777 } 778 } 779 780 /** 781 * DateFormat adjustments taken from mailman 2.1.15 LC_MESSAGES 782 * 783 * @param locale 784 * @return dateformat or null if locale is not supported 785 */ 786 private SimpleDateFormat mailmanDateFormat(final Locale locale) { 787 final String format = "EEE MMM d HH:mm:ss yyyy"; 788 DateFormatSymbols dfsFr = new DateFormatSymbols(locale); 789 790 String[] months = null; 791 String[] weekDays = null; 792 final String code = locale.getLanguage(); 793 if (code.equals(new Locale("fr").getLanguage())) { 794 months = new String[] { "Jan", "Fév", "Mar", "Apr", "May", 795 "Juin", "Juil", "Aou", "Sep", "Oct", "Nov", "Déc" }; 796 // first needs to be empty for DateFormat, starting with sunday 797 weekDays = new String[] { "", "Dim", "Lun", "Mar", "Mer", 798 "Jeu", "Ven", "Sam" }; 799 } 800 801 if (code.equals(new Locale("es").getLanguage())) { 802 months = new String[] { "Ene", "Feb", "Mar", "Abr", "May", 803 "Jun", "Jul", "Ago", "Sep", "Oct", "Nov", "Dic" }; 804 weekDays = new String[] { "", "Dom", "Lun", "Mar", "Mie", 805 "Jue", "Vie", "Sab" }; 806 } 807 808 if (code.equals(new Locale("de").getLanguage())) { 809 months = new String[] { "Jan", "Feb", "Mär", "Apr", "Mai", 810 "Jun", "Jul", "Aug", "Sep", "Okt", "Nov", "Dez" }; 811 weekDays = new String[] { "", "So", "Mo", "Di", "Mi", "Do", 812 "Fr", "Sa" }; 813 } 814 815 // default? 816 if (code.equals(new Locale("en").getLanguage())) { 817 return new SimpleDateFormat(format, locale); 818 } 819 820 if (months == null || weekDays == null) { 821 return null; 822 } 823 824 dfsFr.setShortWeekdays(weekDays); 825 dfsFr.setShortMonths(months); 826 827 return new SimpleDateFormat(format, dfsFr); 828 } 829 830 public void run() { 831 832 final Archive archive = context.archive(); 833 try { 834 final BufferedReader in = new BufferedReader( 835 new InputStreamReader(getInputStream(), 836 archive.inputEncoding())); 837 String author = ""; 838 Date date = null; 839 StringBuilder bodyB = new StringBuilder(); 840 841 boolean inContent = false; 842 for (String line = in.readLine(); line != null; line = in 843 .readLine()) { 844 Matcher m = null; 845 if (inContent) { 846 m = PAT_CONTENT_END.matcher(line); 847 if (m.find()) { 848 inContent = false; 849 continue; 850 } 851 852 String clean = line.replaceAll("\\<.*?>", ""); 853 // remove escaped tags 854 clean = StringEscapeUtils.unescapeHtml4(clean); 855 clean = clean.replaceAll("\\<.*?>", ""); 856 857 if (clean.startsWith("___________________________")) { 858 inContent = false; 859 continue; 860 } 861 862 if (!clean.startsWith(">") 863 && !clean.startsWith(""") 864 && !clean.endsWith("wrote:") 865 && !clean.startsWith("-------")) { 866 bodyB.append(clean).append(" "); 867 } 868 869 continue; 870 } 871 872 // effectively ignores first line of content for detection 873 // this is intended. 874 m = PAT_CONTENT_START.matcher(line); 875 if (m.find()) { 876 inContent = true; 877 continue; 878 } 879 880 m = PAT_AUTHOR.matcher(line); 881 if (author.isEmpty() && m.find()) { 882 Matcher m2 = PAT_AUTHOR2.matcher(line); 883 if (!m2.find()) { // author on next line 884 line = in.readLine(); 885 } 886 Matcher m3 = PAT_AUTHOR2.matcher(line); 887 if (m3.find()) { 888 author = m3.group(1); 889 final String[] ATs = { "at", "en" /* spanish */}; 890 for (final String AT : ATs) { 891 author = author.replace(" " + AT + " ", "@"); 892 } 893 } 894 continue; 895 } 896 897 m = PAT_SENTDATE.matcher(line); 898 if (date == null && m.find()) { 899 // Timezone is always localized in English by Mailman. 900 // therefore we have to parse separately 901 final String tz = m.group(5); 902 903 // build date format in correct order 904 final StringBuilder dateStringB = new StringBuilder(); 905 final String weekDay = m.group(1); 906 dateStringB.append(weekDay); 907 908 // Find out where the day number is, second or third 909 // field... 910 Pattern NUM = Pattern.compile("\\d+"); 911 String month; 912 String dayOfMonth; 913 if (NUM.matcher(m.group(2)).find()) { 914 dayOfMonth = m.group(2); 915 month = m.group(3); 916 } else { 917 dayOfMonth = m.group(3); 918 month = m.group(2); 919 } 920 921 // reassemble 922 dateStringB.append(" ").append(month); 923 dateStringB.append(" ").append(dayOfMonth); 924 dateStringB.append(" ").append(m.group(4)); // time 925 dateStringB.append(" ").append(m.group(6)); // year 926 927 final String dateString = dateStringB.toString(); 928 dateFormat.setTimeZone(TimeZone.getTimeZone(tz)); 929 date = dateFormat.parse(dateString); 930 continue; 931 } 932 } 933 final String summary = HarvestUtil.summarize(bodyB.toString()); 934 final List<String> diffUrls = hCache.findDiffUrls(bodyB 935 .toString()); 936 LOGGER.finest("new message parsed: " + url()); 937 938 if (!diffUrls.isEmpty()) { 939 final Message msg = Message.create(summary, author, 940 archiveUrl(), path(), diffUrls, date); 941 LOGGER.fine("new diff message parsed: " + msg.toString()); 942 hCache.store(msg, aVerifier); 943 } 944 945 context.startUpdate(Marker.create(path(), date)); 946 947 } catch (ParseException e) { 948 final StringBuilder msgB = new StringBuilder() 949 .append("Could not parse date: "); 950 msgB.append(e.getMessage()); 951 DateFormatSymbols dfs = dateFormat.getDateFormatSymbols(); 952 msgB.append("\nLocalized week day strings: ").append( 953 Arrays.toString(dfs.getShortWeekdays())); 954 msgB.append("\nLocalized month strings: ").append( 955 Arrays.toString(dfs.getShortMonths())); 956 LOGGER.log(Level.WARNING, msgB.toString(), e); 957 archive.setFaulty(msgB.toString()); 958 } catch (SQLException e) { 959 final String msg = "Database problem: " + e.getMessage(); 960 LOGGER.log(Level.SEVERE, msg, e); 961 archive.setFaulty(msg); 962 } catch (IOException e) { // should never happen 963 final String msg = "Bug: IO-Problem while reading page buffer: " 964 + e.getMessage(); 965 LOGGER.log(Level.SEVERE, msg, e); 966 archive.setFaulty(msg); 967 } catch (Exception e) { 968 final String msg = "Bug: Something weird happened: " 969 + e.getMessage(); 970 LOGGER.log(Level.SEVERE, msg, e); 971 archive.setFaulty(msg); 972 } 973 974 context.next(); 975 } 976 } 977 978 /** 979 * Any index type fetcher that can be defined by a {@linkplain Pattern 980 * pattern} can use this routine. <b>declared package private for example 981 * documentation</b> Order is by match on stream. 982 */ 983 private List<String> indexPage(final InputStream is, final Pattern pat, 984 final String encoding) { 985 if (is == null) { 986 LOGGER.log(Level.WARNING, 987 "Tried to index a stream which is NULL. This is a bug!"); 988 return new LinkedList<String>(); 989 } 990 991 BufferedReader tempIn = null; 992 try { 993 tempIn = new BufferedReader(new InputStreamReader(is, encoding)); 994 } catch (UnsupportedEncodingException e) { 995 LOGGER.log(Level.WARNING, "Encoding '" + encoding 996 + "' is not supported. This is a faulty Java setup!", e); 997 return new LinkedList<String>(); 998 } 999 final BufferedReader in = tempIn; 1000 1001 LinkedList<String> hits = new LinkedList<String>(); 1002 try { 1003 for (String s = in.readLine(); s != null; s = in.readLine()) { 1004 Matcher m = pat.matcher(s); 1005 1006 if (m.find()) { 1007 hits.add(m.group(1)); 1008 continue; 1009 } 1010 } 1011 } catch (IOException e) { 1012 LOGGER.log(Level.WARNING, 1013 "Cannot read the HTML input stream. This is a bug.", e); 1014 } finally { 1015 try { 1016 in.close(); 1017 } catch (IOException e) { 1018 LOGGER.log(Level.WARNING, 1019 "Cannot class the HTML input stream. This is a bug.", e); 1020 } 1021 } 1022 1023 return hits; 1024 } 1025 1026}