# Copyright (C) 2000-2001, Stefan Schwarzer
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# - Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
# - Neither the name of the above author nor the names of the
#   contributors to the software may be used to endorse or promote
#   products derived from this software without specific prior written
#   permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

'''
irclogmerge - read several IRC log files, merge them, and sort them
              by timestamp

Usage: python irclogmerge.py [options] files

Options:
  -h            print this help text
  -f format     assume format of certain program (available: ircii, mirc)
  -s            split resulting file by months
  -o filename   use this as filename or its base (if -s is given)
  -y year       if year is not found in log, use this year (4 digits)

Return codes:
  0   no error
  1   invalid command line
  2   error during reading
  3   error during writing
'''

import getopt
import glob
import os
import string
import sys
import time
import types

try:
    # used to use re but that's buggy in Python 2.1.1, so the old "pre"
    # engine is preferred wherever it is still shipped
    import pre as re
except ImportError:
    # interpreters without the "pre" module: fall back to plain re
    import re

__version__ = '1.0'

# ircII:
# IRC log started Mon Nov 1 21:26
#
# mIRC:
# Session Start: Sun Feb 20 00:59:45 2000
# Session Close: Sun Feb 20 01:15:10 2000

# Bug: Since ircII doesn't write years in its log files, all ircII
#      sessions appear to have happened in the same year.
#TODO auto-increment year if crossing year boundary

# Year used for logs that carry no year; set by "-y" or lazily by
# default_year().
_default_year = None


def default_year():
    """Return the year to assume for sessions whose log has no year.

    The value given with "-y" wins; otherwise the current local year is
    determined once and remembered in the module-level _default_year.
    """
    global _default_year
    if not _default_year:
        # not set on the command line -> use the current year
        _default_year = time.strftime('%Y', time.localtime(time.time()))
    return _default_year


class Timestamp:
    """Point in time stored as the sortable string 'YYYYMMDDhhmmss'."""

    _months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    def __init__(self, year, month_string, day, hour, minute, second):
        """Convert the given arguments to a distinct timestamp.

        All arguments except month_string are integers or strings of
        digits; month_string is an English three-letter month name.

        Example: '1999', 'May', '4', '13', '9', '0' -> '19990504130900'

        Raises ValueError for a non-numeric field or an unknown month.
        """
        year, day, hour, minute, second = \
            [int(field) for field in (year, day, hour, minute, second)]
        try:
            month = self._months.index(month_string) + 1
        except ValueError:
            raise ValueError('unknown month name: %r' % (month_string,))
        self.value = '%04d%02d%02d%02d%02d%02d' % (
            year, month, day, hour, minute, second)

    def __call__(self):
        'Return whole timestamp'
        return self.value

    def year(self):
        'Return year part of timestamp (4 characters)'
        return self.value[:4]

    def month(self):
        'Return month part of timestamp (2 characters)'
        return self.value[4:6]


class Session:
    """Abstract Session base class.

    The following methods/attributes have to be defined in derived
    classes:
      session_start_pattern    compiled regex matching the first line
      sessions_split_pattern   compiled regex matching one whole session
      set_timestamp( groups )  store a Timestamp in self.timestamp
    """

    def __init__(self, session_string):
        """Generate a single session object from a string.

        An empty session_string yields an object without timestamp; such
        objects only serve as a handle for calling extract_all().

        Raises ValueError if the text does not begin with a session
        header that session_start_pattern recognizes.
        """
        self._string = session_string or ''
        self.timestamp = None
        if not session_string:
            return
        # the header has to show up within the first 80 characters
        match = self.session_start_pattern.match(session_string, 0, 80)
        if match is None:
            first_line = session_string[:80].split('\n')[0]
            raise ValueError('malformed session start: %r' % (first_line,))
        self.set_timestamp(match.groups())

    def __call__(self):
        'Return the text of the session, including the start and end patterns'
        return self._string

    def extract_all(self, lines):
        """Return a list of session objects from the string(s) in lines.

        lines is either one string or a sequence of strings which are
        concatenated as they are.  The returned objects are instances of
        the class of self.
        """
        # turn sequences of strings into a single string
        if not isinstance(lines, type('')):
            lines = ''.join(lines)
        # split the text into individual session strings and map them to
        # appropriate session objects
        session_class = self.__class__
        return [session_class(text)
                for text in self.sessions_split_pattern.findall(lines)]

    def __str__(self):
        return self._string


class IrciiSession(Session):
    'Specialized ircII session class'

    session_start_pattern = re.compile(
        r'IRC log started \w+ (\w+)\s+(\d+)\s+(\d+):(\d+)')
    # a session reaches up to the next start line or the end of the text
    sessions_split_pattern = re.compile(
        r'^IRC log started \w+ \w+ .*?'
        r'(?=\012IRC log started \w+ \w+ |\Z)', re.M | re.S)

    def set_timestamp(self, values):
        'Set the timestamp property of the session object'
        month, day, hour, minute = values
        # ircII logs contain neither year nor seconds
        self.timestamp = Timestamp(default_year(), month, day,
                                   hour, minute, '0')


class MircSession(Session):
    'Specialized mIRC session class'

    session_start_pattern = re.compile(
        r'Session Start: \w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)')
    # a session reaches from its start line to the end of its close line
    sessions_split_pattern = re.compile(
        r'^Session Start: \w+ \w+ .*?'
        r'^Session Close: \w+ \w+ .*?$', re.M | re.S)

    def set_timestamp(self, values):
        'Set the timestamp property of the session object'
        month, day, hour, minute, second, year = values
        self.timestamp = Timestamp(year, month, day, hour, minute, second)


# Maps the (lower-cased) argument of "-f" to the session class to use.
_SESSION_CLASSES = {
    'ircii': IrciiSession,
    'mirc': MircSession,
}


def _group_by_month(session_list):
    'Return [(year_month, sessions), ...] in order of first appearance'
    keys = []
    groups = {}
    for session in session_list:
        timestamp = session.timestamp
        year_month = timestamp.year() + timestamp.month()
        bucket = groups.get(year_month)
        if bucket is None:
            bucket = groups[year_month] = []
            keys.append(year_month)
        bucket.append(session)
    return [(key, groups[key]) for key in keys]


def _write_sessions(filename, sessions):
    'Write each session plus a newline to filename; may raise IOError'
    print('Writing ' + filename)
    output = open(filename, 'w')
    try:
        for session in sessions:
            output.write(str(session) + '\n')
    finally:
        # don't leak the handle if a write fails
        output.close()


def output_sessions(session_list, output_name, split_flag):
    """Write sessions to file(s).

    With a true split_flag the sessions are distributed over one file
    per month, named output_name + 'YYYYMM'; all sessions of a month end
    up in the same file even if they are not adjacent in session_list.
    An empty session_list writes no file in this mode.  Otherwise
    everything goes into the single file output_name.

    On an IOError a message is printed and the program exits with
    return code 3.
    """
    if split_flag:
        targets = [(output_name + year_month, sessions)
                   for year_month, sessions in _group_by_month(session_list)]
    else:
        # no splitting - use output_name literally
        targets = [(output_name, session_list)]
    for filename, sessions in targets:
        try:
            _write_sessions(filename, sessions)
        except IOError:
            print("Error: can't write %s (%s)"
                  % (filename, sys.exc_info()[1]))
            sys.exit(3)


def _usage_error(message):
    'Print message and usage, then exit with return code 1'
    print('Error: %s' % (message,))
    print(__doc__)
    sys.exit(1)


def _read_sessions(filename, session_class):
    'Return the sessions in filename; may raise IOError or ValueError'
    print('Reading ' + filename)
    log_file = open(filename, 'r')
    try:
        lines = log_file.readlines()
    finally:
        log_file.close()
    # turn whole file into a list of session objects
    return session_class('').extract_all(lines)


def _sort_by_timestamp(sessions):
    'Return a new list ordered by timestamp; equal stamps keep their order'
    keys = [session.timestamp() for session in sessions]
    # the running index breaks ties, so session objects are never compared
    decorated = list(zip(keys, range(len(sessions)), sessions))
    decorated.sort()
    return [entry[2] for entry in decorated]


def main():
    """Read the files, merge, and sort them.

    Options and file patterns are taken from sys.argv.  Exits with one
    of the return codes listed in the module docstring on errors.
    """
    global _default_year

    print('This is irclogmerge ' + __version__)
    print('Copyright (C) 2000-2001, Stefan Schwarzer')

    # defaults
    session_class, split_flag, output_name = IrciiSession, 0, 'IrcLog'
    try:
        opts, patterns = getopt.getopt(sys.argv[1:], 'hf:so:y:')
    except getopt.error:
        _usage_error(sys.exc_info()[1])

    # support globbing
    filenames = []
    for pattern in patterns:
        matches = glob.glob(pattern)
        if not matches:
            print('Warning: no files match ' + pattern)
        filenames = filenames + matches

    for opt, val in opts:
        if opt == '-h':
            print(__doc__)
            sys.exit(1)
        elif opt == '-f':
            # look the class up instead of eval()ing command line text
            session_class = _SESSION_CLASSES.get(val.lower())
            if session_class is None:
                _usage_error('unknown log format %r' % (val,))
        elif opt == '-s':
            split_flag = 1
        elif opt == '-o':
            output_name = val
        elif opt == '-y':
            if len(val) != 4 or not val.isdigit():
                _usage_error('year must have 4 digits, got %r' % (val,))
            _default_year = val

    sessions = []
    for filename in filenames:
        try:
            sessions = sessions + _read_sessions(filename, session_class)
        except (IOError, ValueError):
            # ValueError: undecodable text or a malformed session header
            print("Warning: can't read %s (%s)"
                  % (filename, sys.exc_info()[1]))
    if not sessions:
        print('Error: no IRC sessions available')
        sys.exit(2)

    print('Sorting sessions')
    sessions = _sort_by_timestamp(sessions)
    output_sessions(sessions, output_name, split_flag)


if __name__ == '__main__':
    main()