Source code

001package votorola.a.diff.harvest; // Copyright 2012, Christian Weilbach.  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE.
002
003import java.io.BufferedReader;
004import java.io.IOException;
005import java.io.InputStream;
006import java.io.InputStreamReader;
007import java.io.PrintStream;
008import java.io.UnsupportedEncodingException;
009
010import java.sql.SQLException;
011import java.text.DateFormatSymbols;
012import java.text.ParseException;
013import java.text.SimpleDateFormat;
014import java.util.Arrays;
015import java.util.Collections;
016import java.util.Date;
017import java.util.HashMap;
018import java.util.LinkedList;
019import java.util.List;
020import java.util.Locale;
021import java.util.Map;
022import java.util.TimeZone;
023import java.util.concurrent.atomic.AtomicBoolean;
024import java.util.logging.Level;
025import java.util.logging.Logger;
026import java.util.regex.Matcher;
027import java.util.regex.Pattern;
028
029import org.apache.commons.lang3.StringEscapeUtils;
030
031import com.ibm.icu.util.Calendar;
032import com.ibm.icu.util.GregorianCalendar;
033
034import votorola.a.diff.harvest.auth.BasicAuthenticator;
035import votorola.a.diff.harvest.cache.HarvestCache;
036import votorola.a.diff.harvest.kick.Kick;
037import votorola.a.diff.harvest.kick.KickReceiver;
038import votorola.a.diff.harvest.kick.Kicker;
039import votorola.a.diff.harvest.kick.UpdateKick;
040import votorola.a.diff.harvest.run.AbstractFetcher;
041import votorola.a.diff.harvest.run.Fetcher;
042import votorola.a.diff.harvest.run.HarvestRunner;
043import votorola.g.lang.ThreadSafe;
044import votorola.g.lang.Warning;
045import votorola.g.logging.LoggerX;
046
047/**
048 * <p>
049 * A harvester implementation for pipermail archives. Pipermail archives are
050 * generated by <a href="http://en.wikipedia.org/wiki/GNU_Mailman">mailman</a>.
051 * Harvested messages are stored in {@linkplain HarvestCache} and fetchers are
052 * scheduled with {@linkplain HarvestRunner}.
053 * </p>
054 * 
055 * <p>
056 * Basically we do a <a
057 * href="http://en.wikipedia.org/wiki/Depth-first_search">depth first search</a>
058 * sorted by date. Web-view on the remote archive for this harvester:
059 * </p>
060 * <br/>
061 * 
062 * <pre>
063 * <b>Tree of linked pages.     Level      Example remote archive URLs</b>
064 *             r1              {@linkplain RootFetcher root}       <i>http://mail.zelea.com/list/votorola/</i>    
065 *            / \
066 *           /   \
067 *          /     \
068 *         /       \
069 *       i2        i3          {@linkplain PeriodFetcher index}      <i>2010-Jan/date.html</i>                      
070 *       / \       /|\
071 *      /   \     / | \
072 *     m4    m5 m6 m7 m8       {@linkplain MessageFetcher message}    <i>2010-Jan/003321.html</i>
073 * </pre>
074 * 
075 * <p>
076 * All fetchers depend on <b>one</b> HTTP request. Runtime steps for these
077 * {@linkplain AbstractFetcher fetchers}:
078 * <ol>
079 * <li>Create fetcher and {@linkplain HarvestRunner#scheduleLast(Fetcher)
080 * schedule} it</li>
081 * <li>Once {@linkplain HarvestRunner} handles the fetcher, it asynchronously
082 * fetches the HTTP remote page and runs the fetcher in its thread pool</li>
083 * <li>Having access to {@linkplain AbstractFetcher#getInputStream()} now, the
084 * fetcher parses the page</li>
085 * <li>r1, i2, i3 start fetchers for the index of URLs parsed out of the page</li>
086 * </ol>
087 * </p>
088 * <p>
089 * Result after r1: <b>i2-i3</b>, after i2: <b>m4-m5-i3</b> after i3:
090 * <b>m6-m7-m8</b>. <br/>
091 * The state is saved by anonymously extending the last MessageFetcher like m5 or
092 * m8 appropriately.
093 * </p>
094 * <p>
095 * <b>Note:</b> This is only an example. You can submit your
096 * {@linkplain votorola.a.diff.harvest.run.Fetcher} differently and you can also
097 * save state differently. Usage of the {@linkplain HarvestRunner} is
098 * recommended for graceful I/O handling though.
099 * </p>
100 * 
101 * @see votorola.a.diff.harvest The communication diagram for details of the
102 *      overall harvesting concept.
103 */
104public class PipermailHarvester {
105
106    private final static Logger LOGGER = LoggerX.i(PipermailHarvester.class);
107
108    // / / / Harvesting services
109
110    private final HarvestCache hCache;
111
112    /**
113     * Since PipermailHarvester.PAT_AUTHOR allows us to parse the E-Mail out of
114     * the page, we can generate the MailishUsername directly and therefore use
115     * the verifier which just compares mailish usernames of the authors of the
116     * difference with the author of the message.
117     */
118    private final BasicAuthenticator aVerifier;
119
120    /**
121     * Global singleton scheduler.
122     */
123    private final HarvestRunner runner = HarvestRunner.i();
124
125    // / / / Forum specific settings
126
127    // list-info page
128
129    /**
130     * Pattern for list-info page. (for setup)
131     */
132    public final static Pattern PAT_LISTINFO = Pattern
133            .compile("<a href=\"(.*)\">");
134
135    // root page
136
137    /**
138     * Find input encoding.
139     */
140    public final static Pattern PAT_INPUTENC = Pattern
141            .compile("text/html; charset=([a-z0-9\\-]+)");
142
143    /**
144     * Parse language.
145     */
146    public final static Pattern PAT_LANG = Pattern
147            .compile("<INPUT name=\"language\" type=\"HIDDEN\" value=\"(\\w+)\" >");
148
149    /**
150     * Pattern to parse sub-list of posts for each month in
151     * {@linkplain RootFetcher}. If period is in years: 2012/date.html ... in
152     * months: 2012-October/date.html TODO add/fix weekly pattern if used
153     * somewhere
154     */
155    public final static Pattern PAT_PERIOD = Pattern
156            .compile("=\"((\\d+)-?(\\w+)?)/date.html\"\\>\\[ \\S+ \\]");
157    /**
158     * Pattern to parse list of posts in this month list in
159     * {@linkplain PeriodFetcher}.
160     */
161    public final static Pattern PAT_POST = Pattern
162            .compile("=\"(\\S+\\.html)\"\\>");
163
164    /**
165     * Pattern to find author email in {@linkplain MessageFetcher}.
166     */
167    public final static Pattern PAT_AUTHOR = Pattern.compile("HREF=\"mailto");
168    /**
169     * Pattern to parse author email in {@linkplain MessageFetcher}.
170     */
171    public final static Pattern PAT_AUTHOR2 = Pattern.compile("TITLE.+\\>(.+)");
172    /**
173     * Pattern to parse sent date in {@linkplain MessageFetcher}.
174     */
175    public final static Pattern PAT_SENTDATE = Pattern
176            .compile("<I>(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)</I>");
177    // example with grouping: (Di )(Okt )(30 )(23:48:50 )(CET )(2012)
178    // or: (Jeu )(1 )(Nov )(12:00:20 )(UTC )(2012)
179    // needed to always parse English timezone in Messagefetcher
180    /**
181     * Pattern to find the start of the body of the message in
182     * {@linkplain MessageFetcher}.
183     */
184    public final static Pattern PAT_CONTENT_START = Pattern
185            .compile("\\<PRE\\>");
186    /**
187     * Pattern to find the end of the body of the message in
188     * {@linkplain MessageFetcher}.
189     */
190    public final static Pattern PAT_CONTENT_END = Pattern.compile("\\</PRE\\>");
191
192    private final StateTable stateTable;
193
194    /**
195     * Structure to save state of a single archive. <b>declared package private
196     * for example documentation</b>
197     */
198    @Warning("non-API")
199    @ThreadSafe
200    class Archive {
201
202        class SetupFetcher extends AbstractFetcher {
203            final Archive archive;
204
205            public SetupFetcher(final Archive archive) {
206                super(archive.archiveUrl(), "");
207                this.archive = archive;
208            }
209
210            @Override
211            public void run() {
212                BufferedReader in;
213                String tmpListInfo = "";
214                String enc = "";
215                try {
216                    // expect the first fetch to work for html parsing only with
217                    // iso-8859-1 encoding (?)
218                    in = new BufferedReader(new InputStreamReader(
219                            getInputStream(), "iso-8859-1"));
220
221                    for (String line = in.readLine(); line != null; line = in
222                            .readLine()) {
223                        Matcher m;
224                        m = PAT_INPUTENC.matcher(line);
225                        if (m.find()) {
226                            enc = m.group(1);
227                            archive.setInputEncoding(enc);
228                        }
229
230                        m = PAT_LISTINFO.matcher(line);
231                        if (m.find()) {
232                            tmpListInfo = m.group(1);
233                        }
234                    }
235                } catch (UnsupportedEncodingException e) {
236                    final String msg = "Bug: Could not parse list-info HTML with iso-8859-1.";
237                    LOGGER.severe(msg);
238                    archive.setFaulty(msg);
239                } catch (IOException e) {
240                    final String msg = "Bug: Could not get list-info HTML.";
241                    LOGGER.severe(msg);
242                    archive.setFaulty(msg);
243                }
244
245                if (enc.isEmpty() || tmpListInfo.isEmpty()) {
246                    final String msg = "Could parse setup for archive:"
247                            + (enc.isEmpty() ? " encoding not found" : "")
248                            + (tmpListInfo.isEmpty() ? ", list-info page not found"
249                                    : "");
250                    LOGGER.severe(msg);
251                    archive.setFaulty(msg);
252                    return;
253                }
254                archive.setInputEncoding(enc);
255                final String listInfo = tmpListInfo;
256
257                runner.scheduleFirst(new Fetcher() {
258
259                    @Override
260                    public void run() {
261                        final String enc = archive.inputEncoding();
262
263                        final String langCode = indexPage(inputStream,
264                                PAT_LANG, enc).get(0);
265                        final Locale locale = new Locale(langCode);
266
267                        archive.setLocale(locale);
268                    }
269
270                    @Override
271                    public void fault(final String msg) {
272                        archive.setFaulty(msg);
273                    }
274
275                    // / / Fetcher
276
277                    @Override
278                    public String archiveUrl() {
279                        return archiveUrl;
280                    }
281
282                    @Override
283                    public String url() {
284                        return listInfo;
285                    }
286
287                    private InputStream inputStream;
288
289                    @Override
290                    public void setInputStream(InputStream in) {
291                        this.inputStream = in;
292                    }
293
294                    @Override
295                    public void setStatusCode(int code) {
296                        if (code >= 400) {
297                            final String error = "Failed to fetch list-info page: "
298                                    + listInfo;
299                            LOGGER.warning(error);
300                            archive.setFaulty(error);
301                        }
302                    }
303
304                });
305            }
306
307            @Override
308            public void fault(final String msg) {
309                archive.setFaulty(msg);
310            }
311        }
312
313        private final String archiveUrl;
314
315        public Archive(final String archiveUrl) {
316            this.archiveUrl = archiveUrl;
317            runner.scheduleFirst(new SetupFetcher(this));
318        }
319
320        public synchronized String archiveUrl() {
321            return archiveUrl;
322        }
323
324        private volatile boolean isFaulty = false;
325
326        public synchronized boolean isFaulty() {
327            return isFaulty;
328        }
329
330        private volatile String faultMsg = "";
331
332        public synchronized String fault() {
333            return faultMsg;
334        }
335
336        public synchronized void setFaulty(final String msg) {
337            this.faultMsg = msg;
338            isFaulty = true;
339        }
340
341        private volatile Locale locale = null;
342
343        public synchronized void setLocale(final Locale locale) {
344            if (locale == null) {
345                final String msg = "Could not parse locale, is null.";
346                LOGGER.severe(msg);
347                setFaulty(msg);
348                return;
349            }
350            this.locale = locale;
351        }
352
353        private volatile String inputEncoding = "";
354
355        public synchronized void setInputEncoding(final String encoding) {
356            if (encoding.isEmpty()) {
357                final String msg = "Parsed encoding is empty.";
358                LOGGER.severe(msg);
359                setFaulty(msg);
360                return;
361            }
362            inputEncoding = encoding;
363        }
364
365        public synchronized String inputEncoding() {
366            return inputEncoding;
367        }
368
369        /**
370         * Whether the archive has been updated in this instance yet.
371         */
372        public final AtomicBoolean firstRun = new AtomicBoolean(true);
373
374        private volatile Date nextPeriodStart = new Date(0);
375
376        private volatile String currentPeriod = "";
377
378        /**
379         * This determines whether UpdateContext will lookup the current month
380         * directly or do an index update if the month has changed.
381         * 
382         * @param current
383         */
384        public synchronized void setCurrentPeriod(String current) {
385            this.currentPeriod = current;
386
387            String format = "yyyy";
388            final Matcher m = PAT_PERIOD.matcher(current);
389            boolean monthly = false;
390            if (m.find()) {
391                final String month = m.group(3);
392                if (!month.isEmpty()) {
393                    monthly = true;
394                    format += "-MMMM";
395                }
396            }
397
398            SimpleDateFormat sdf = new SimpleDateFormat(format, Locale.ENGLISH);
399            GregorianCalendar cal = new GregorianCalendar();
400            try {
401                cal.setTime(sdf.parse(current));
402            } catch (ParseException e) {
403                LOGGER.log(Level.WARNING, "Cannot parse date from: " + current,
404                        e);
405            }
406            cal.roll(monthly ? Calendar.MONTH : Calendar.YEAR, true);
407            this.nextPeriodStart = cal.getTime();
408        }
409
410        public synchronized Date nextPeriodStart() {
411            return nextPeriodStart;
412        }
413
414        public synchronized String currentPeriod() {
415            return currentPeriod;
416        }
417
418        public synchronized Locale locale() {
419            return locale;
420        }
421
422    }
423
424    /**
425     * Tracks all configured archives. Key is the base URL of the archive.
426     * <b>declared package private for example documentation</b>
427     */
428    final Map<String, Archive> archives = Collections
429            .synchronizedMap(new HashMap<String, Archive>());
430
431    /**
432     * Receives {@linkplain Kick kicks} from {@linkplain Kicker} for this
433     * Harvester. <b>declared package private for example documentation</b>
434     */
435    @Warning("non-API")
436    class MyKickReceiver implements KickReceiver {
437
438        /**
439         * Handles kick events from {@linkplain Kicker}
440         * 
441         * @param kick
442         *            some {@linkplain UpdateKick}
443         */
444        public void handle(Kick kick) {
445
446            if (kick instanceof UpdateKick) {
447                final UpdateKick updateKick = (UpdateKick) kick;
448
449                if (!updateKick.archiveDesign().contains("Pipermail")) {
450                    return;
451                }
452
453                final String archiveUrl = updateKick.archiveUrl();
454
455                if (!archives.containsKey(archiveUrl)) {
456                    archives.put(archiveUrl, new Archive(archiveUrl));
457                }
458
459                UpdateContext context = new UpdateContext(updateKick);
460                context.run();
461            }
462        }
463    }
464
465    /**
466     * Construct a new harvester for pipermail.
467     */
468    public PipermailHarvester() {
469        hCache = HarvestCache.i();
470        Kicker.i().register(new MyKickReceiver());
471        aVerifier = new BasicAuthenticator();
472        stateTable = new StateTable(hCache.getDatabase());
473        try {
474            if (!stateTable.exists()) {
475                stateTable.create();
476            }
477        } catch (SQLException e) {
478            LOGGER.log(Level.SEVERE, "State table initialization error.", e);
479            System.exit(1);
480        }
481    }
482
483    /**
484     * Provides update context.
485     */
486    @Warning("non-API")
487    abstract class ContextFetcher extends AbstractFetcher {
488        protected final UpdateContext context;
489
490        public ContextFetcher(final UpdateContext context, final String path) {
491            super(context.archiveUrl(), path);
492            this.context = context;
493        }
494
495        @Override
496        public void fault(final String msg) {
497            context.archive.setFaulty(msg);
498            context.next();
499        }
500
501    }
502
503    /**
504     * Context to run fetchers and synchronize state during an update.
505     * 
506     */
507    @Warning("non-API")
508    @ThreadSafe
509    class UpdateContext implements Runnable {
510
511        private final AtomicBoolean isStarted = new AtomicBoolean(false);
512
513        private final List<AbstractFetcher> queue = Collections
514                .synchronizedList(new LinkedList<AbstractFetcher>());
515
516        public final Marker endMarker;
517
518        private volatile Marker startMarker;
519
520        private final Archive archive;
521
522        // kick objects
523        private final String archiveUrl;
524        private final PrintStream reportStream;
525        private final HarvestReporter reporter;
526
527        public UpdateContext(final UpdateKick kick) {
528            this.archiveUrl = kick.archiveUrl();
529            this.archive = archives.get(archiveUrl);
530            this.reporter = kick.reporter();
531            this.reportStream = kick.reporter().printStream();
532
533            Marker tempEndMarker = null;
534            try {
535                // ignore possibly stale temporary entries
536                if (archive.firstRun.get()) {
537                    tempEndMarker = stateTable.getNewest(archiveUrl,
538                            StateTable.Type.PERM);
539                } else {
540                    tempEndMarker = stateTable.getNewest(archiveUrl);
541                }
542            } catch (SQLException e) {
543                LOGGER.log(
544                        Level.SEVERE,
545                        "Cannot access state storage, further processing is useless.",
546                        e);
547                System.exit(1);
548            } finally {
549                endMarker = tempEndMarker;
550            }
551        }
552
553        public synchronized String archiveUrl() {
554            return archiveUrl;
555        }
556
557        public synchronized void startUpdate(final Marker m)
558                throws SQLException {
559            if (isStarted.get()) { // already running.
560                return;
561            }
562
563            final String message = "Starting update to "
564                    + (endMarker.path().isEmpty() ? "beginning of archive "
565                            : endMarker.path());
566            LOGGER.fine(message);
567            reportStream.println(message);
568
569            // other fetcher has grabbed start url already
570            if (!stateTable.put(archiveUrl, m) && !archive.firstRun.get()) {
571                finished();
572            }
573
574            startMarker = m;
575            isStarted.set(true);
576            archive.firstRun.set(false);
577        }
578
579        public synchronized void run() {
580            AbstractFetcher fetcher;
581            if (new Date().after(archive.nextPeriodStart())) {
582                fetcher = new RootFetcher(this);
583            } else {
584                fetcher = new PeriodFetcher(archive.currentPeriod(), this);
585            }
586            runner.scheduleLast(fetcher);
587        }
588
589        public synchronized void failure(final String message) {
590            reportStream.println("Archive: " + archiveUrl + " failed with: "
591                    + message);
592            queue.clear();
593            reporter.proccessFinished();
594        }
595
596        public synchronized void next() {
597            if (archive.isFaulty()) {
598                failure(archive.fault());
599                return;
600            }
601
602            if (!queue.isEmpty()) {
603                Fetcher fetcher = queue.remove(0);
604                if (!isStarted.get()) {
605                    reportStream.println("entering archive: " + fetcher.url());
606                } else {
607                    reportStream.println("harvesting: " + fetcher.url());
608                }
609                runner.scheduleLast(fetcher);
610            } else {
611                finished();
612            }
613        }
614
615        public synchronized void finished() {
616            if (isStarted.get()) {
617                LOGGER.fine("Finishing update: " + endMarker.path());
618                try {
619                    stateTable.finish(archiveUrl, startMarker, endMarker);
620                    stateTable.update(archiveUrl);
621                } catch (SQLException e) {
622                    LOGGER.log(
623                            Level.SEVERE,
624                            "Cannot save state, further processing will be useless.",
625                            e);
626                    System.exit(1);
627                }
628            }
629            queue.clear(); // allow garbage collection
630            reporter.proccessFinished();
631        }
632
633        public synchronized Archive archive() {
634            return archive;
635        }
636
637    }
638
639    /**
640     * An index fetcher for the whole pipermail archive, like
641     * "http://mail.zelea.com/list/votorola/". Spawns new
642     * {@linkplain PeriodFetcher month fetchers}. <b>declared package private
643     * for example documentation</b>
644     * 
645     * @see PeriodFetcher
646     */
647    @Warning("non-API")
648    class RootFetcher extends ContextFetcher {
649
650        /**
651         * Carries the context along.
652         * 
653         * @param context
654         */
655        public RootFetcher(final UpdateContext context) {
656            super(context, "");
657        }
658
659        public void run() {
660            // wait for encoding and locale to be parsed
661            // this only happens on the first run
662            if(context.archive.locale()==null
663                    || context.archive.inputEncoding().isEmpty()) {
664                runner.scheduleLast(this);
665                return;
666            }
667            
668            try {
669                // newest first
670                List<String> periods = indexPage(getInputStream(), PAT_PERIOD,
671                        context.archive().inputEncoding());
672
673                final String thisMonth = periods.get(0);
674                archives.get(archiveUrl()).setCurrentPeriod(thisMonth);
675
676                // newest last
677                Collections.reverse(periods);
678                for (final String period : periods) {
679                    context.queue.add(0, new PeriodFetcher(period, context));
680                }
681            } catch (Exception e) {
682                final String msg = "Could not fetch root element: "
683                        + e.getMessage();
684                LOGGER.log(Level.WARNING, msg, e);
685                context.archive.setFaulty(msg);
686            }
687            context.next();
688        }
689    }
690
691    /**
692     * An index fetcher for one period like "2010-January/date.html" or
693     * "2012/date.html". Spawns new {@linkplain MessageFetcher fetchers}
694     * Fetching the "date.html" index allows us to schedule page fetches by
695     * PipermailHarvester.PAT_SENTDATE. * <b>declared package private for
696     * example documentation</b>
697     * 
698     * @see RootFetcher
699     * @see MessageFetcher
700     */
701    @Warning("non-API")
702    class PeriodFetcher extends ContextFetcher {
703
704        public PeriodFetcher(final String period, final UpdateContext context) {
705            super(context, period + "/date.html");
706            this.period = period;
707        }
708
709        private final String period;
710
711        public void run() {
712            try {
713                // newest last
714                List<String> posts = indexPage(getInputStream(), PAT_POST,
715                        context.archive().inputEncoding());
716                // newest first
717                Collections.reverse(posts);
718                LinkedList<String> cleanList = new LinkedList<String>();
719
720                synchronized (context) {
721                    for (String post : posts) {
722                        if (context.endMarker.path()
723                                .equals(period + "/" + post)) {
724                            // finished when we hit the marker
725                            context.queue.clear(); // remove earlier months
726                            break;
727                        }
728                        cleanList.add(post);
729                    }
730
731                    // newest last
732                    Collections.reverse(cleanList);
733                    for (String post : cleanList) {
734                        context.queue.add(0, new MessageFetcher(context, period
735                                + "/" + post));
736                    }
737                }
738                context.next();
739            } catch (Exception e) {
740                final String msg = "Could not fetch period " + period + ": "
741                        + e.getMessage();
742                LOGGER.log(Level.WARNING, msg, e);
743                context.archive().setFaulty(msg);
744            }
745        }
746    }
747
748    /**
749     * A fetcher for a single page, like "2010-Jan/003882.html". Contains only
750     * one message in pipermail. <b>declared package private for example
751     * documentation</b>
752     * 
753     * @see PeriodFetcher
754     */
755    @Warning("non-API")
756    class MessageFetcher extends ContextFetcher {
757
758        /**
759         * Timezone is parsed in English separately,
760         * 
761         * @see run()
762         */
763        private final SimpleDateFormat dateFormat;
764
765        public MessageFetcher(final UpdateContext context, final String path) {
766            super(context, path);
767            final Locale locale = context.archive.locale();
768            if (locale == null) {
769                context.failure("Locale is null");
770                dateFormat = null;
771                return;
772            }
773
774            dateFormat = mailmanDateFormat(locale);
775            if (dateFormat == null) {
776                context.failure("Locale not supported: " + locale.getCountry());
777            }
778        }
779
780        /**
781         * DateFormat adjustments taken from mailman 2.1.15 LC_MESSAGES
782         * 
783         * @param locale
784         * @return dateformat or null if locale is not supported
785         */
786        private SimpleDateFormat mailmanDateFormat(final Locale locale) {
787            final String format = "EEE MMM d HH:mm:ss yyyy";
788            DateFormatSymbols dfsFr = new DateFormatSymbols(locale);
789
790            String[] months = null;
791            String[] weekDays = null;
792            final String code = locale.getLanguage();
793            if (code.equals(new Locale("fr").getLanguage())) {
794                months = new String[] { "Jan", "Fév", "Mar", "Apr", "May",
795                        "Juin", "Juil", "Aou", "Sep", "Oct", "Nov", "Déc" };
796                // first needs to be empty for DateFormat, starting with sunday
797                weekDays = new String[] { "", "Dim", "Lun", "Mar", "Mer",
798                        "Jeu", "Ven", "Sam" };
799            }
800
801            if (code.equals(new Locale("es").getLanguage())) {
802                months = new String[] { "Ene", "Feb", "Mar", "Abr", "May",
803                        "Jun", "Jul", "Ago", "Sep", "Oct", "Nov", "Dic" };
804                weekDays = new String[] { "", "Dom", "Lun", "Mar", "Mie",
805                        "Jue", "Vie", "Sab" };
806            }
807
808            if (code.equals(new Locale("de").getLanguage())) {
809                months = new String[] { "Jan", "Feb", "Mär", "Apr", "Mai",
810                        "Jun", "Jul", "Aug", "Sep", "Okt", "Nov", "Dez" };
811                weekDays = new String[] { "", "So", "Mo", "Di", "Mi", "Do",
812                        "Fr", "Sa" };
813            }
814
815            // default?
816            if (code.equals(new Locale("en").getLanguage())) {
817                return new SimpleDateFormat(format, locale);
818            }
819
820            if (months == null || weekDays == null) {
821                return null;
822            }
823
824            dfsFr.setShortWeekdays(weekDays);
825            dfsFr.setShortMonths(months);
826
827            return new SimpleDateFormat(format, dfsFr);
828        }
829
830        public void run() {
831
832            final Archive archive = context.archive();
833            try {
834                final BufferedReader in = new BufferedReader(
835                        new InputStreamReader(getInputStream(),
836                                archive.inputEncoding()));
837                String author = "";
838                Date date = null;
839                StringBuilder bodyB = new StringBuilder();
840
841                boolean inContent = false;
842                for (String line = in.readLine(); line != null; line = in
843                        .readLine()) {
844                    Matcher m = null;
845                    if (inContent) {
846                        m = PAT_CONTENT_END.matcher(line);
847                        if (m.find()) {
848                            inContent = false;
849                            continue;
850                        }
851
852                        String clean = line.replaceAll("\\<.*?>", "");
853                        // remove escaped tags
854                        clean = StringEscapeUtils.unescapeHtml4(clean);
855                        clean = clean.replaceAll("\\<.*?>", "");
856
857                        if (clean.startsWith("___________________________")) {
858                            inContent = false;
859                            continue;
860                        }
861
862                        if (!clean.startsWith(">")
863                                && !clean.startsWith("&quot;")
864                                && !clean.endsWith("wrote:")
865                                && !clean.startsWith("-------")) {
866                            bodyB.append(clean).append(" ");
867                        }
868
869                        continue;
870                    }
871
872                    // effectively ignores first line of content for detection
873                    // this is intended.
874                    m = PAT_CONTENT_START.matcher(line);
875                    if (m.find()) {
876                        inContent = true;
877                        continue;
878                    }
879
880                    m = PAT_AUTHOR.matcher(line);
881                    if (author.isEmpty() && m.find()) {
882                        Matcher m2 = PAT_AUTHOR2.matcher(line);
883                        if (!m2.find()) { // author on next line
884                            line = in.readLine();
885                        }
886                        Matcher m3 = PAT_AUTHOR2.matcher(line);
887                        if (m3.find()) {
888                            author = m3.group(1);
889                            final String[] ATs = { "at", "en" /* spanish */};
890                            for (final String AT : ATs) {
891                                author = author.replace(" " + AT + " ", "@");
892                            }
893                        }
894                        continue;
895                    }
896
897                    m = PAT_SENTDATE.matcher(line);
898                    if (date == null && m.find()) {
899                        // Timezone is always localized in English by Mailman.
900                        // therefore we have to parse separately
901                        final String tz = m.group(5);
902
903                        // build date format in correct order
904                        final StringBuilder dateStringB = new StringBuilder();
905                        final String weekDay = m.group(1);
906                        dateStringB.append(weekDay);
907
908                        // Find out where the day number is, second or third
909                        // field...
910                        Pattern NUM = Pattern.compile("\\d+");
911                        String month;
912                        String dayOfMonth;
913                        if (NUM.matcher(m.group(2)).find()) {
914                            dayOfMonth = m.group(2);
915                            month = m.group(3);
916                        } else {
917                            dayOfMonth = m.group(3);
918                            month = m.group(2);
919                        }
920
921                        // reassemble
922                        dateStringB.append(" ").append(month);
923                        dateStringB.append(" ").append(dayOfMonth);
924                        dateStringB.append(" ").append(m.group(4)); // time
925                        dateStringB.append(" ").append(m.group(6)); // year
926
927                        final String dateString = dateStringB.toString();
928                        dateFormat.setTimeZone(TimeZone.getTimeZone(tz));
929                        date = dateFormat.parse(dateString);
930                        continue;
931                    }
932                }
933                final String summary = HarvestUtil.summarize(bodyB.toString());
934                final List<String> diffUrls = hCache.findDiffUrls(bodyB
935                        .toString());
936                LOGGER.finest("new message parsed: " + url());
937
938                if (!diffUrls.isEmpty()) {
939                    final Message msg = Message.create(summary, author,
940                            archiveUrl(), path(), diffUrls, date);
941                    LOGGER.fine("new diff message parsed: " + msg.toString());
942                    hCache.store(msg, aVerifier);
943                }
944
945                context.startUpdate(Marker.create(path(), date));
946
947            } catch (ParseException e) {
948                final StringBuilder msgB = new StringBuilder()
949                        .append("Could not parse date: ");
950                msgB.append(e.getMessage());
951                DateFormatSymbols dfs = dateFormat.getDateFormatSymbols();
952                msgB.append("\nLocalized week day strings: ").append(
953                        Arrays.toString(dfs.getShortWeekdays()));
954                msgB.append("\nLocalized month strings: ").append(
955                        Arrays.toString(dfs.getShortMonths()));
956                LOGGER.log(Level.WARNING, msgB.toString(), e);
957                archive.setFaulty(msgB.toString());
958            } catch (SQLException e) {
959                final String msg = "Database problem: " + e.getMessage();
960                LOGGER.log(Level.SEVERE, msg, e);
961                archive.setFaulty(msg);
962            } catch (IOException e) { // should never happen
963                final String msg = "Bug: IO-Problem while reading page buffer: "
964                        + e.getMessage();
965                LOGGER.log(Level.SEVERE, msg, e);
966                archive.setFaulty(msg);
967            } catch (Exception e) {
968                final String msg = "Bug: Something weird happened: "
969                        + e.getMessage();
970                LOGGER.log(Level.SEVERE, msg, e);
971                archive.setFaulty(msg);
972            }
973
974            context.next();
975        }
976    }
977
978    /**
979     * Any index type fetcher that can be defined by a {@linkplain Pattern
980     * pattern} can use this routine. <b>declared package private for example
981     * documentation</b> Order is by match on stream.
982     */
983    private List<String> indexPage(final InputStream is, final Pattern pat,
984            final String encoding) {
985        if (is == null) {
986            LOGGER.log(Level.WARNING,
987                    "Tried to index a stream which is NULL. This is a bug!");
988            return new LinkedList<String>();
989        }
990
991        BufferedReader tempIn = null;
992        try {
993            tempIn = new BufferedReader(new InputStreamReader(is, encoding));
994        } catch (UnsupportedEncodingException e) {
995            LOGGER.log(Level.WARNING, "Encoding '" + encoding
996                    + "' is not supported. This is a faulty Java setup!", e);
997            return new LinkedList<String>();
998        }
999        final BufferedReader in = tempIn;
1000
1001        LinkedList<String> hits = new LinkedList<String>();
1002        try {
1003            for (String s = in.readLine(); s != null; s = in.readLine()) {
1004                Matcher m = pat.matcher(s);
1005
1006                if (m.find()) {
1007                    hits.add(m.group(1));
1008                    continue;
1009                }
1010            }
1011        } catch (IOException e) {
1012            LOGGER.log(Level.WARNING,
1013                    "Cannot read the HTML input stream. This is a bug.", e);
1014        } finally {
1015            try {
1016                in.close();
1017            } catch (IOException e) {
1018                LOGGER.log(Level.WARNING,
1019                        "Cannot class the HTML input stream. This is a bug.", e);
1020            }
1021        }
1022
1023        return hits;
1024    }
1025
1026}