001package votorola.a.diff.harvest.cache; // Copyright 2010-2012. Christian Weilbach. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE. 002 003import java.io.IOException; 004import java.io.File; 005import java.net.URISyntaxException; 006import java.security.NoSuchAlgorithmException; 007import java.sql.SQLException; 008import java.util.Collections; 009import java.util.HashMap; 010import java.util.LinkedList; 011import java.util.List; 012import java.util.logging.Level; 013import java.util.logging.Logger; 014import java.util.regex.Matcher; 015import java.util.regex.Pattern; 016 017import javax.script.ScriptException; 018 019import votorola.a.VoteServer; 020import votorola.a.diff.DiffKeyParse; 021import votorola.a.diff.DraftPair; 022import votorola.a.diff.harvest.Message; 023import votorola.a.diff.harvest.MessageContext; 024import votorola.a.diff.harvest.auth.Authenticator; 025import votorola.a.diff.harvest.kick.Kicker; 026import votorola.a.diff.harvest.kick.UpdateKick; 027import votorola.a.count.Vote; 028import votorola.a.count.XCastRelation; 029import votorola.g.lang.ThreadSafe; 030import votorola.g.logging.LoggerX; 031import votorola.g.script.JavaScriptIncluder; 032import votorola.g.sql.Database; 033 034/** 035 * Pass your messages from the communication media the cache to find and store 036 * valid difference messages. The overall process of processing messages in 037 * regard to this cache is outlined here: 038 * 039 * <pre> 040 * 041 * {@linkplain HarvestCache#check(MessageContext, Authenticator) HarvestCache} {@linkplain HarvestCache#store(Message, Authenticator) HarvestCache} 042 * ^ ^ 043 * | | 044 * | {@linkplain MessageContext} | {@linkplain Message} 045 * | | 046 * | | 047 * Detectors Parse message from HTML (harvester) 048 * 049 * {@linkplain UpdateKick} (via {@linkplain Kicker}) 050 * 1. Message detection ------------> 2. Archive harvesting 051 * </pre> 052 * 053 * The steps of 1. and 2. are completely separated, since we have to parse the 054 * information from the web anyway to gain valid URLs for some media. If the Url 055 * to the message can be parsed in the detector no harvesting (web scraping) of 056 * the archive is necessary and Message objects can directly be used. In fact 057 * they both only differ in this very crucial information. 058 */ 059public @ThreadSafe 060class HarvestCache { 061 062 /** 063 * The cache needs access to a VoteServer.Run and must be initialized 064 * before usage. 065 */ 066 public static HarvestCache init(final VoteServer.Run vsRun) { 067 if (instance == null) { 068 synchronized (HarvestCache.class) { 069 try { 070 instance = new HarvestCache(vsRun); 071 } catch (Exception e) { 072 LOGGER.log(Level.SEVERE, 073 "Could not initialize HarvestCache.", e); 074 System.exit(1); 075 } 076 } 077 } 078 return instance; 079 } 080 081 /** 082 * Implemented as singleton. 083 */ 084 public static HarvestCache i() { 085 return instance; 086 } 087 088 private static HarvestCache instance; 089 090 /** 091 * Global logger object. 092 */ 093 private static final Logger LOGGER = LoggerX.i(HarvestCache.class); 094 095 /** 096 * Maximum length of the communication excerpt cut out of the message body. 097 */ 098 public static final int CHARLIMIT = 150; 099 100 /** 101 * This VoteServer.Run instance is needed to query authorship information 102 * from the VoteServer. 103 */ 104 private final VoteServer.Run vsRun; 105 106 private final DiffMessageTable table; 107 108 /** 109 * Database which contains the {@linkplain #diffMessageTable() Table} 110 */ 111 private final Database db; 112 113 /** 114 * Access the database of this HarvestCache instance. This can be used 115 * initialize other services with this db internally, so you only have to 116 * setup HarvestCache once and all services have access to the db. 117 * 118 * @return db 119 */ 120 public Database getDatabase() { 121 return db; 122 } 123 124 /** 125 * Patterns for difference bridge URLs. If those match links in the message 126 * body, these matched difference links are then checked for valid 127 * difference messages. 128 */ 129 private final List<Pattern> diffPatterns; 130 131 /** 132 * A context for configuring the construction of a {@linkplain VoteServer 133 * vote-server}. The construction is configured by the vote-server's 134 * {@linkplain VoteServer#startupConfigurationFile() startup configuration 135 * file}, which contains a script (s) for that purpose. During construction, 136 * an instance of this context (vsCC) is passed to s via 137 * s::constructingVoteServer(vsCC). 138 */ 139 public static @ThreadSafe 140 final class ConstructionContext { 141 142 /** 143 * Constructs the complete configuration of the harvest-cache. 144 * 145 * @param s 146 * the complete startup configuration script. 147 */ 148 private static ConstructionContext configure(final JavaScriptIncluder s) 149 throws ScriptException, URISyntaxException { 150 final ConstructionContext cc = new ConstructionContext(s); 151 s.invokeKnownFunction("constructingHarvestCache", cc); 152 return cc; 153 } 154 155 private ConstructionContext(final JavaScriptIncluder s) { 156 } 157 158 private final List<String> patternList = new LinkedList<String>(); 159 160 public List<String> getBridgePatternList() { 161 return patternList; 162 } 163 164 } 165 166 /** 167 * Construct a cache to which you pass your messages from the communication 168 * media. Communicates with a VoteServer 169 * 170 * @see votorola.a.VoteServer 171 * 172 * @param diffPatterns 173 * Patterns to match urls of requested difference bridges. 174 * @throws URISyntaxException 175 * VoteServer initialisation fails. 176 * @throws NoSuchAlgorithmException 177 * SHA hashing not available. This should never happen. 178 * @throws SQLException 179 * Problem while initializing the database connection. 180 * @throws ScriptException 181 * Cannot load configuration file (script) for VoteServer. 182 * @throws IOException 183 * VoteServer has some IO-Problem. 184 */ 185 private HarvestCache(final VoteServer.Run vsRun) throws URISyntaxException, 186 NoSuchAlgorithmException, IOException, ScriptException, 187 SQLException { 188 List<Pattern> tempDiffPatterns = new LinkedList<Pattern>(); 189 190 VoteServer v = vsRun.voteServer(); 191 final File configFile = new File(v.votorolaDirectory(), 192 "harvest-cache.js"); 193 if (configFile.exists()) { 194 final ConstructionContext cc = ConstructionContext 195 .configure(new JavaScriptIncluder(configFile)); 196 197 for (final String pat : cc.getBridgePatternList()) { 198 tempDiffPatterns.add(Pattern.compile(pat)); 199 LOGGER.finest("Added pattern for bridge: " + pat); 200 } 201 } else { 202 final String bridgeUrl = "http://" + v.serverName() + ":8080/" 203 + v.name() + "/w/D"; 204 LOGGER.info("BridgeURL(s) not configured, using default: " 205 + bridgeUrl); 206 tempDiffPatterns.add(Pattern.compile(Pattern.quote(bridgeUrl) 207 + "\\S+")); 208 } 209 210 this.diffPatterns = Collections.unmodifiableList(tempDiffPatterns); 211 this.vsRun = vsRun; 212 this.db = vsRun.database(); 213 this.table = new DiffMessageTable(db); 214 if(!table.exists()) { 215 table.create(); 216 } 217 }; 218 219 /** 220 * Return the VoteServer instance for this Cache. 221 * 222 */ 223 public VoteServer voteServer() { 224 return vsRun.voteServer(); 225 } 226 227 /** 228 * TODO Examine the message for difference-urls and return whether this 229 * message contains valid difference information. This includes search for 230 * difference-urls, comparing the message sender to author and candidate of 231 * the difference and verifying that the used forum is linked on the 232 * candidate's position page. Use this method in detectors to decide whether 233 * to raise a {@linkplain votorola.a.diff.harvest.kick.UpdateKick}. 234 * 235 * @param mc 236 * Context of the message to check. 237 * @param av 238 * Verifier for forum identities. 239 * @return List of DraftPairs encountered or an empty list. 240 */ 241 public boolean check(final MessageContext mc, final Authenticator av) { 242 return false; 243 } 244 245 /** 246 * Call this once you have a valid message including 247 * {@linkplain Message#path() url} to the web, to store it permanently in 248 * the database. 249 * 250 * This is the main public API call which processes messages: 251 * <ol> 252 * <li>Separate each {@linkplain DiffMessage} for each Url</li> 253 * <li>Authenticate message to {@linkplain AuthDiffMessage}</li> 254 * <li>TODO store relation in {@linkplain RelAuthDiffMessage}</li> 255 * <li>Store message with {@linkplain DiffMessageTable} 256 * </li> 257 * </ol> 258 * 259 * @param msg 260 * Message to process. 261 * @param av 262 * Verifier for this communication medium's identities. 263 * @return False if any of the steps failed. 264 */ 265 public boolean store(final Message msg, final Authenticator av) { 266 final List<DiffMessage> potMsgs = expandDiffMessages(msg); 267 if (potMsgs.isEmpty()) { 268 return false; 269 } 270 271 for (final DiffMessage dmsg : potMsgs) { 272 try { 273 final AuthDiffMessage authMsg = av.verify(dmsg); 274 if (authMsg == null) { 275 return false; 276 } 277 LOGGER.finest("Trying to store message: " + msg.mc().sentDate() 278 + " " + authMsg.author() + " " + authMsg.addressee()); 279 280 return put(authMsg); 281 } catch (Exception e) { 282 LOGGER.log(Level.WARNING, 283 "Could not verify message: " + msg.content() 284 + " with date: " + msg.mc().sentDate(), e); 285 return false; 286 } 287 } 288 return true; 289 } 290 291 /** 292 * Access this table to 293 * {@linkplain DiffMessageTable#get(String, String[], int)} messages. 294 */ 295 public DiffMessageTable getTable() { 296 return table; 297 } 298 299 /** 300 * Stores the DiffMessage finally into the database. 301 * 302 * @param dmsg 303 * Message to store in the database. 304 * @return False if a database error occurs. 305 */ 306 private boolean put(final AuthDiffMessage dmsg) { 307 try { 308 table.put(dmsg); 309 } catch (Exception e) { 310 LOGGER.log(Level.WARNING, "Putting message " 311 + dmsg.toString() 312 + " to DB failed. ", e); 313 return false; 314 } 315 return true; 316 } 317 318 /** 319 * Lists all interesting difference-urls. Interest is determined by the 320 * difference patterns configured for this cache. 321 * 322 * @param searchedString 323 * Message body or content to search. 324 * @return A list of matched difference URLs. 325 * 326 * @see #expandDiffMessages(Message) 327 * @see #diffPatterns 328 */ 329 public List<String> findDiffUrls(final String searchedString) { 330 LinkedList<String> urlList = new LinkedList<String>(); 331 for (Pattern diffPattern : diffPatterns) { 332 Matcher diffMatcher = diffPattern.matcher(searchedString); 333 while (diffMatcher.find()) { 334 String diffUrl = diffMatcher.group(); 335 urlList.add(diffUrl); 336 } 337 } 338 return urlList; 339 } 340 341 /** 342 * Generate a list of @see {@linkplain votorola.a.diff.harvest.cache.DiffMessage} 343 * by comparing all difference-urls of the message. 344 * 345 * @param msg 346 * @return A list of difference messages cloned from msg and decorated with 347 * the difference authorship information. 348 */ 349 private List<DiffMessage> expandDiffMessages(final Message msg) { 350 final List<DiffMessage> potMsgs = new LinkedList<DiffMessage>(); 351 for (String diffUrl : msg.mc().diffUrls()) { 352 for (Pattern diffPattern : diffPatterns) { 353 if (!diffPattern.matcher(diffUrl).find()) { 354 continue; 355 } 356 357 String[] paramStrings = new String[] { "a", "b", "aR", "bR" }; 358 HashMap<String, Integer> pMap = new HashMap<String, Integer>(); 359 for (String ps : paramStrings) { 360 Pattern pat = Pattern.compile(ps + "=(\\d+)"); 361 Matcher mat = pat.matcher(diffUrl); 362 pMap.put(ps, mat.find() ? Integer.parseInt(mat.group(1)) 363 : -1); 364 } 365 366 try { 367 DraftPair draftPair = DraftPair.newDraftPair( 368 new DiffKeyParse(pMap.get("a"), pMap.get("aR"), // CWFIX obsolete form 369 pMap.get("b"), pMap.get("bR")), 370 vsRun.voteServer().pollwiki()); 371 final DiffMessage potMsg = new DiffMessage(msg, draftPair); 372 373 potMsgs.add(potMsg); 374// } catch (MediaWiki.IDException e) { 375// LOGGER.log(Level.FINER, 376// "DraftPair revision for " + msg.path() + " failed."); 377// continue; 378// } catch (Exception e) { 379// LOGGER.log(Level.WARNING, 380// "Cannot expand DiffMessage, draftPair failed.", e); 381// continue; 382 } catch (IOException e) { // CWFIX review this change 383 LOGGER.log(Level.CONFIG, 384 "DraftPair construction for " + msg.path() + " failed.", e); 385 continue; 386 } 387 } 388 } 389 return potMsgs; 390 } 391 392 /** 393 * TODO was private, privatize again when vote history is available. Used by 394 * votorola/s/wap/HarvestWAP.java for now. 395 * 396 * @param author 397 * @param addressee 398 * @param pollName 399 * @return current relation between author and addressee 400 */ 401 public XCastRelation relation(final String author, 402 final String addressee, final String pollName) { 403 try { 404 final Vote authorVote = new Vote(author, vsRun.scopePoll() 405 .ensurePoll(pollName).voterInputTable()); 406 final Vote addresseeVote = new Vote(addressee, vsRun.scopePoll() 407 .ensurePoll(pollName).voterInputTable()); 408 if (addresseeVote.getCandidateEmail() != null 409 && addresseeVote.getCandidateEmail().equals(author)) { 410 return XCastRelation.CANDIDATE; 411 } 412 if (authorVote.getCandidateEmail() != null 413 && authorVote.getCandidateEmail().equals(addressee)) { 414 return XCastRelation.VOTER; 415 } 416 417 } catch (IOException | SQLException | ScriptException e) { 418 LOGGER.log(Level.WARNING, 419 "Cannot determine author/voter relationship.", e); 420 } 421 422 return XCastRelation.UNKNOWN; 423 } 424 425}