001package votorola.a.diff.harvest.cache; // Copyright 2010-2012. Christian Weilbach.  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE.
002
003import java.io.IOException;
004import java.io.File;
005import java.net.URISyntaxException;
006import java.security.NoSuchAlgorithmException;
007import java.sql.SQLException;
008import java.util.Collections;
009import java.util.HashMap;
010import java.util.LinkedList;
011import java.util.List;
012import java.util.logging.Level;
013import java.util.logging.Logger;
014import java.util.regex.Matcher;
015import java.util.regex.Pattern;
016
017import javax.script.ScriptException;
018
019import votorola.a.VoteServer;
020import votorola.a.diff.DiffKeyParse;
021import votorola.a.diff.DraftPair;
022import votorola.a.diff.harvest.Message;
023import votorola.a.diff.harvest.MessageContext;
024import votorola.a.diff.harvest.auth.Authenticator;
025import votorola.a.diff.harvest.kick.Kicker;
026import votorola.a.diff.harvest.kick.UpdateKick;
027import votorola.a.count.Vote;
028import votorola.a.count.XCastRelation;
029import votorola.g.lang.ThreadSafe;
030import votorola.g.logging.LoggerX;
031import votorola.g.script.JavaScriptIncluder;
032import votorola.g.sql.Database;
033
034/**
 * Pass your messages from the communication media to the cache to find and
 * store valid difference messages. The overall process of processing messages
 * with regard to this cache is outlined here:
038 * 
039 * <pre>
040 * 
041 *     {@linkplain HarvestCache#check(MessageContext, Authenticator) HarvestCache}                      {@linkplain HarvestCache#store(Message, Authenticator) HarvestCache}
042 *         ^                                    ^
043 *         |                                    |
044 *         | {@linkplain MessageContext}                     | {@linkplain Message}
045 *         |                                    |
046 *         |                                    |
047 *     Detectors                       Parse message from HTML (harvester)
048 *     
049 *                        {@linkplain UpdateKick} (via {@linkplain Kicker})
050 *  1. Message detection ------------> 2. Archive harvesting
051 * </pre>
052 * 
 * Steps 1 and 2 are completely separated, since we have to parse the
 * information from the web anyway to gain valid URLs for some media. If the URL
 * of the message can be parsed in the detector, no harvesting (web scraping) of
 * the archive is necessary and Message objects can be used directly. In fact,
 * the two kinds of object differ only in this very crucial information.
058 */
059public @ThreadSafe
060class HarvestCache {
061
062    /**
063     * The cache needs access to a VoteServer.Run and must be initialized
064     * before usage.
065     */
066    public static HarvestCache init(final VoteServer.Run vsRun) {
067        if (instance == null) {
068            synchronized (HarvestCache.class) {
069                try {
070                    instance = new HarvestCache(vsRun);
071                } catch (Exception e) {
072                    LOGGER.log(Level.SEVERE,
073                            "Could not initialize HarvestCache.", e);
074                    System.exit(1);
075                }
076            }
077        }
078        return instance;
079    }
080
081    /**
082     * Implemented as singleton.
083     */
084    public static HarvestCache i() {
085        return instance;
086    }
087
088    private static HarvestCache instance;
089
    /**
     * Global logger object.
     */
    private static final Logger LOGGER = LoggerX.i(HarvestCache.class);

    /**
     * Maximum length of the communication excerpt cut out of the message body.
     */
    public static final int CHARLIMIT = 150;

    /**
     * The run this cache was initialized with. It supplies the vote-server,
     * the database and the polls that are queried by this cache.
     */
    private final VoteServer.Run vsRun;

    /**
     * Table into which the difference messages are stored. It is created by
     * the constructor if it does not exist yet.
     */
    private final DiffMessageTable table;

    /**
     * Database which contains the {@linkplain #getTable() table}.
     */
    private final Database db;
112
    /**
     * Returns the database of this HarvestCache instance. It can be used to
     * initialize other services with the same database, so that only the
     * HarvestCache has to be set up and all services share its database.
     * 
     * @return the database this cache was constructed with.
     */
    public Database getDatabase() {
        return db;
    }
123
    /**
     * Patterns for difference bridge URLs. Links in a message body that match
     * one of these patterns are treated as difference links and are then
     * checked for valid difference messages. The list is unmodifiable.
     */
    private final List<Pattern> diffPatterns;
130
131    /**
132     * A context for configuring the construction of a {@linkplain VoteServer
133     * vote-server}. The construction is configured by the vote-server's
134     * {@linkplain VoteServer#startupConfigurationFile() startup configuration
135     * file}, which contains a script (s) for that purpose. During construction,
136     * an instance of this context (vsCC) is passed to s via
137     * s::constructingVoteServer(vsCC).
138     */
139    public static @ThreadSafe
140    final class ConstructionContext {
141
142        /**
143         * Constructs the complete configuration of the harvest-cache.
144         * 
145         * @param s
146         *            the complete startup configuration script.
147         */
148        private static ConstructionContext configure(final JavaScriptIncluder s)
149                throws ScriptException, URISyntaxException {
150            final ConstructionContext cc = new ConstructionContext(s);
151            s.invokeKnownFunction("constructingHarvestCache", cc);
152            return cc;
153        }
154
155        private ConstructionContext(final JavaScriptIncluder s) {
156        }
157
158        private final List<String> patternList = new LinkedList<String>();
159
160        public List<String> getBridgePatternList() {
161            return patternList;
162        }
163
164    }
165
166    /**
167     * Construct a cache to which you pass your messages from the communication
168     * media. Communicates with a VoteServer
169     * 
170     * @see votorola.a.VoteServer
171     * 
172     * @param diffPatterns
173     *            Patterns to match urls of requested difference bridges.
174     * @throws URISyntaxException
175     *             VoteServer initialisation fails.
176     * @throws NoSuchAlgorithmException
177     *             SHA hashing not available. This should never happen.
178     * @throws SQLException
179     *             Problem while initializing the database connection.
180     * @throws ScriptException
181     *             Cannot load configuration file (script) for VoteServer.
182     * @throws IOException
183     *             VoteServer has some IO-Problem.
184     */
185    private HarvestCache(final VoteServer.Run vsRun) throws URISyntaxException,
186            NoSuchAlgorithmException, IOException, ScriptException,
187            SQLException {
188        List<Pattern> tempDiffPatterns = new LinkedList<Pattern>();
189
190        VoteServer v = vsRun.voteServer();
191        final File configFile = new File(v.votorolaDirectory(),
192                "harvest-cache.js");
193        if (configFile.exists()) {
194            final ConstructionContext cc = ConstructionContext
195                    .configure(new JavaScriptIncluder(configFile));
196
197            for (final String pat : cc.getBridgePatternList()) {
198                tempDiffPatterns.add(Pattern.compile(pat));
199                LOGGER.finest("Added pattern for bridge: " + pat);
200            }
201        } else {
202            final String bridgeUrl = "http://" + v.serverName() + ":8080/"
203                    + v.name() + "/w/D";
204            LOGGER.info("BridgeURL(s) not configured, using default: "
205                    + bridgeUrl);
206            tempDiffPatterns.add(Pattern.compile(Pattern.quote(bridgeUrl)
207                    + "\\S+"));
208        }
209
210        this.diffPatterns = Collections.unmodifiableList(tempDiffPatterns);
211        this.vsRun = vsRun;
212        this.db = vsRun.database();
213        this.table = new DiffMessageTable(db);
214        if(!table.exists()) {
215            table.create();
216        }
217    };
218
    /**
     * Returns the VoteServer instance for this cache.
     * 
     * @return the vote-server of the run this cache was initialized with.
     */
    public VoteServer voteServer() {
        return vsRun.voteServer();
    }
226
    /**
     * TODO Not implemented yet: this stub ignores its arguments and always
     * returns false.
     * <p>
     * Intended behaviour: examine the message for difference-urls and return
     * whether this message contains valid difference information. This
     * includes search for difference-urls, comparing the message sender to
     * author and candidate of the difference and verifying that the used forum
     * is linked on the candidate's position page. Use this method in detectors
     * to decide whether to raise a
     * {@linkplain votorola.a.diff.harvest.kick.UpdateKick}.
     * 
     * @param mc
     *            Context of the message to check.
     * @param av
     *            Verifier for forum identities.
     * @return Currently always false. Once implemented, whether the message
     *         contains valid difference information.
     */
    public boolean check(final MessageContext mc, final Authenticator av) {
        return false;
    }
244
245    /**
246     * Call this once you have a valid message including
247     * {@linkplain Message#path() url} to the web, to store it permanently in
248     * the database.
249     * 
250     * This is the main public API call which processes messages:
251     * <ol>
252     * <li>Separate each {@linkplain DiffMessage} for each Url</li>
253     * <li>Authenticate message to {@linkplain AuthDiffMessage}</li>
254     * <li>TODO store relation in {@linkplain RelAuthDiffMessage}</li>
255     * <li>Store message with {@linkplain DiffMessageTable}
256     * </li>
257     * </ol>
258     * 
259     * @param msg
260     *            Message to process.
261     * @param av
262     *            Verifier for this communication medium's identities.
263     * @return False if any of the steps failed.
264     */
265    public boolean store(final Message msg, final Authenticator av) {
266        final List<DiffMessage> potMsgs = expandDiffMessages(msg);
267        if (potMsgs.isEmpty()) {
268            return false;
269        }
270
271        for (final DiffMessage dmsg : potMsgs) {
272            try {
273                final AuthDiffMessage authMsg = av.verify(dmsg);
274                if (authMsg == null) {
275                    return false;
276                }
277                LOGGER.finest("Trying to store message: " + msg.mc().sentDate()
278                        + " " + authMsg.author() + " " + authMsg.addressee());
279
280                return put(authMsg);
281            } catch (Exception e) {
282                LOGGER.log(Level.WARNING,
283                        "Could not verify message: " + msg.content()
284                                + " with date: " + msg.mc().sentDate(), e);
285                return false;
286            }
287        }
288        return true;
289    }
290
    /**
     * Returns the table into which this cache stores its messages. Access
     * this table to
     * {@linkplain DiffMessageTable#get(String, String[], int) get} messages.
     * 
     * @return the message table of this cache.
     */
    public DiffMessageTable getTable() {
        return table;
    }
298
299    /**
300     * Stores the DiffMessage finally into the database.
301     * 
302     * @param dmsg
303     *            Message to store in the database.
304     * @return False if a database error occurs.
305     */
306    private boolean put(final AuthDiffMessage dmsg) {
307        try {
308            table.put(dmsg);
309        } catch (Exception e) {
310            LOGGER.log(Level.WARNING, "Putting message "
311                    + dmsg.toString()
312                    + " to DB failed. ", e);
313            return false;
314        }
315        return true;
316    }
317
318    /**
319     * Lists all interesting difference-urls. Interest is determined by the
320     * difference patterns configured for this cache.
321     * 
322     * @param searchedString
323     *            Message body or content to search.
324     * @return A list of matched difference URLs.
325     * 
326     * @see #expandDiffMessages(Message)
327     * @see #diffPatterns
328     */
329    public List<String> findDiffUrls(final String searchedString) {
330        LinkedList<String> urlList = new LinkedList<String>();
331        for (Pattern diffPattern : diffPatterns) {
332            Matcher diffMatcher = diffPattern.matcher(searchedString);
333            while (diffMatcher.find()) {
334                String diffUrl = diffMatcher.group();
335                urlList.add(diffUrl);
336            }
337        }
338        return urlList;
339    }
340
341    /**
342     * Generate a list of @see {@linkplain votorola.a.diff.harvest.cache.DiffMessage}
343     * by comparing all difference-urls of the message.
344     * 
345     * @param msg
346     * @return A list of difference messages cloned from msg and decorated with
347     *         the difference authorship information.
348     */
349    private List<DiffMessage> expandDiffMessages(final Message msg) {
350        final List<DiffMessage> potMsgs = new LinkedList<DiffMessage>();
351        for (String diffUrl : msg.mc().diffUrls()) {
352            for (Pattern diffPattern : diffPatterns) {
353                if (!diffPattern.matcher(diffUrl).find()) {
354                    continue;
355                }
356
357                String[] paramStrings = new String[] { "a", "b", "aR", "bR" };
358                HashMap<String, Integer> pMap = new HashMap<String, Integer>();
359                for (String ps : paramStrings) {
360                    Pattern pat = Pattern.compile(ps + "=(\\d+)");
361                    Matcher mat = pat.matcher(diffUrl);
362                    pMap.put(ps, mat.find() ? Integer.parseInt(mat.group(1))
363                            : -1);
364                }
365
366                try {
367                    DraftPair draftPair = DraftPair.newDraftPair(
368                            new DiffKeyParse(pMap.get("a"), pMap.get("aR"), // CWFIX obsolete form
369                            pMap.get("b"), pMap.get("bR")),
370                            vsRun.voteServer().pollwiki());
371                    final DiffMessage potMsg = new DiffMessage(msg, draftPair);
372
373                    potMsgs.add(potMsg);
374//              } catch (MediaWiki.IDException e) {
375//                  LOGGER.log(Level.FINER,
376//                          "DraftPair revision for " + msg.path() + " failed.");
377//                  continue;
378//              } catch (Exception e) {
379//                  LOGGER.log(Level.WARNING,
380//                          "Cannot expand DiffMessage, draftPair failed.", e);
381//                  continue;
382                } catch (IOException e) { // CWFIX review this change
383                    LOGGER.log(Level.CONFIG,
384                            "DraftPair construction for " + msg.path() + " failed.", e);
385                    continue;
386                }
387            }
388        }
389        return potMsgs;
390    }
391
392    /**
393     * TODO was private, privatize again when vote history is available. Used by
394     * votorola/s/wap/HarvestWAP.java for now.
395     * 
396     * @param author
397     * @param addressee
398     * @param pollName
399     * @return current relation between author and addressee
400     */
401    public XCastRelation relation(final String author,
402            final String addressee, final String pollName) {
403        try {
404            final Vote authorVote = new Vote(author, vsRun.scopePoll()
405                    .ensurePoll(pollName).voterInputTable());
406            final Vote addresseeVote = new Vote(addressee, vsRun.scopePoll()
407                    .ensurePoll(pollName).voterInputTable());
408            if (addresseeVote.getCandidateEmail() != null
409                    && addresseeVote.getCandidateEmail().equals(author)) {
410                return XCastRelation.CANDIDATE;
411            }
412            if (authorVote.getCandidateEmail() != null
413                    && authorVote.getCandidateEmail().equals(addressee)) {
414                return XCastRelation.VOTER;
415            }
416
417        } catch (IOException | SQLException | ScriptException e) {
418            LOGGER.log(Level.WARNING,
419                    "Cannot determine author/voter relationship.", e);
420        }
421
422        return XCastRelation.UNKNOWN;
423    }
424
425}