#!/usr/bin/env python
"""
Usage: python txt2wav.py x.txt
Creates x.wav

Takes a while. Depends on how fast your computer is.
To produce the txt file from OpenOffice, I just copy and paste the whole contents into a text editor.
Use VLC or Audacity to convert to MP3.
Remember to embed art in the mp3 so it looks nice in an iTunes podcast feed and elsewhere.

To do:
- automatically insert music wav files, at least at the beginning and end
import openoffice odt: http://www.linuxjournal.com/article/9347
encode to mp3: http://pymedia.org/
? instead of encoding to mp3 using VLC can maybe just get a picture and put that and the wav in movie maker and upload that to youtube -- oh but you still want the mp3 for mp3 players
"""

__author__ = 'Patrick Roberts'
__copyright__ = 'Copyright 2009 Patrick Roberts'
__license__ = 'Python'
__version__ = '1.0'


import calendar, os, re, sys, time
import win32com.client
from itertools import *


# Candidate locations of the optional dictionary file. The module-level code
# below uses the first entry that exists as a regular file to build `words`;
# when none exists, `words` is left empty.
dict_paths = [r'm:\dev\words', r'f:\dev\words'] # optional paths to a simple dictionary file, one word per line


def change_file_ext(path, ext):
    """Return `path` with its extension swapped for `ext`.

    `ext` is given without the leading separator (e.g. 'wav'); os.extsep is
    inserted between the extension-less path and `ext`. A path that has no
    extension simply gets the new one appended.
    """
    stem, _old_ext = os.path.splitext(path)
    return stem + os.extsep + ext


# Build `words`: the lower-cased vocabulary that is_word() consults. It is read
# from the first existing file in dict_paths (one word per line) and then
# hand-tuned with removals and additions that fix specific TTS mispronunciations.
# The dictionary is optional: any ordinary failure leaves `words` empty.
try:
    # ideally, each word would get a pattern in the engine's universe!!! that would be a lot of patterns, & i can lookup words & forget them when no longer needed
    #words = set(ipipe(file(r'F:\dev\words'), str.strip, str.lower)) # proper nouns are capitalized in the words file
    dict_path = ifilter(os.path.isfile, dict_paths).next() # raises StopIteration when none of the paths exists
    dict_file = open(dict_path)
    try:
        # strip each line, drop entries that are entirely upper case (like UK), lower-case the rest
        words = set(imap(str.lower, ifilterfalse(lambda w: w.isupper(), imap(str.strip, dict_file))))
    finally:
        dict_file.close() # release the handle promptly rather than leaving it to the garbage collector
    words.difference_update('nc eeg fl ne se sw ar sd ai ga de st mrs er ers bae ba roi ide wi ri sus ak wa ro ug un ka lu mr eu sla ce mo sia si pa ca dram sc'.split()) # for some reason 'ROI' (acronym) was in the unix dict; 'ides' is a word, but not 'ide'; 'ak' is in dict as abbreviation of alaska, but it's not right for 'AK-47' since then AT&T expands it itself as 'alaska'
    words.update('cd tv unix nasa com org gov biz fortran darpa knoppix ajax nasdaq captcha db philadelphia corba tex rom cafe'.split())
        # be nice if the engine had a dictionary, well, word patterns, that would include pronunciation information, & whether it's an abbreviation
        # 'cd' is needed else it mispronounces 'CDs' as 'C Ds', with the 's' pronounced on its own for some reason
except Exception: # missing/unreadable dictionary is fine; KeyboardInterrupt and SystemExit still propagate
    words = set()

def is_word(s):
    """Return True when `s` is found in the module-level `words` set.

    The lookup is case-insensitive. A string ending in 's' also counts when
    the string minus that final 's' is in `words` (a crude plural rule).

    Known weaknesses of the plural rule:
    - it wrongly accepts 'oss', because 'os' is a word but 'oss' is obviously
      not its plural; a dictionary that knew the plurals would make this
      simple assumption unnecessary
    - it fails for terms like 'ISS'
    """
    lowered = s.lower()
    if lowered in words:
        return True
    # naive plural: strip one trailing 's' and look the stem up
    return lowered.endswith('s') and lowered[:-1] in words


def search(state, get_score, get_next_states, time_limit=1, fringe_limit=100, persist=None, key=None, cache={}): #persist=t
    """Best-first search starting at `state`; return the best-scoring state seen.

    The highest-scoring pending state is popped, compared with the best so
    far, and replaced on the fringe by its scored successors. The loop ends
    when the fringe is empty, when get_next_states() yields None (or raises),
    or when `time_limit` has elapsed.

    Parameters:
        state: the initial state; also the result if nothing scores better.
        get_score: callable(state) -> score. If it raises, the error is
            reported through `log` and the state scores 0.
        get_next_states: callable(state) -> iterable of successor states, or
            None to stop the search. If it raises, the error is reported
            through `log` and the search stops.
        time_limit: seconds to keep searching; a falsy value (0/None) means
            no time limit.
        fringe_limit: at most this many of the best-scoring pending states
            are kept after each expansion.
        persist: currently unused (it only appears in disabled code); meant
            to be the number of times a state getting worse is tolerated.
        key: optional hashable cache key. When given, a cached result for it
            is returned immediately and a fresh result is stored under it.
        cache: dict used for memoization. The mutable default is deliberate:
            it is shared by every call that does not pass its own dict.

    Returns:
        The state with the highest score encountered (or the cached one).

    Design notes:
        Eventually, the engine should be doing this!!! effectors would
        replace get_next_state(). Nonlinear optimization is much more
        efficient when you know the function you're trying to optimize, but
        for any interesting problem, you don't know the function. When
        get_next_state always returns 1 state (& the same state) then this is
        really just deciding when to stop. & when is that? The length of time
        to put up with lack of progress should be proportional to, in order,
        what: how bad the best answer is, whether the recent states are
        getting worse.
    """
    if key is not None:
        #trace('%r in cache: %r' % (key, cache.get(key)))
        try:
            return cache[key] # ? use find() so the universe can handle forgetting; if found, update last_used; bad to call find() from another thread, but still better than having a redundant lossy cache; a disadvantage is that a cached value wouldn't be available until the engine runs a bit
        except KeyError:
            pass # cache miss: fall through and run the search

    def safe_get_score(candidate):
        # Scoring must never abort the search: report the failure and score it 0.
        # NOTE(review): `log` is not defined in the visible part of this file - presumably a module-level logger; confirm.
        try:
            return get_score(candidate)
        except Exception:
            log.exception()
            return 0

    while len(cache) > 1000: # - could be removed if you let the engine's forgetfulness do it
        cache.popitem()  # preferable to pop old/unused items, but the engine already does that, so better to speed the engine so it can do this
    started_at = time.time()
    # the fringe: (score, state) pairs kept sorted ascending, so pop() yields the best pending state
    states = [(safe_get_score(state), state)] #[dict(score=safe_get_score(state), state=state, got_successors=False)]
    # - for now, just ignore getting this working for font maximizing too, & have it just work for pronouncing acronyms & for layout
    best_score = None
    best_state = state
    while states and (not time_limit or time.time() - started_at < time_limit):
        score, state = states.pop()
        #if score is None or (len(states) > persist and none(imap(operator.itemgetter(1), states[-persist:]), partial(operator.is_, best_state)) and all(imap(operator.itemgetter(0), states[-persist:]), partial(operator.ge, best_score))):
            #break # would need a recent state list
        # Explicit None handling: any real score beats "no score yet" and a None
        # score never wins (this is what Python 2's implicit None ordering gave).
        if score is not None and (best_score is None or score > best_score):
            best_score = score
            best_state = state
        try:
            successors = get_next_states(state)
        except Exception:
            log.exception()
            successors = None
        # identity test: practically identical state objects might not be comparable; eg, the state may contain a font instance
        if successors is None:
            break
        #scored_successors = sorted((safe_get_score(state), state) for state in successors)
        #if scored_successors and scored_successors[-1][0] > best_score:
        #    best_score, best_state = scored_successors[-1]
        #states = (states + scored_successors)[-fringe_limit:]
        # - this isn't remembering that a state has no successors, so it's recalling get_next_states; i think in my old search, it would have popped the state & only kept its successors (the fringe); i think the distinction missing is whether a state is an end point or not, whether there are end points at all; i think in the old textbook search(), it assumed that the successors of a state must be better than it, so the original state wouldn't be kept, & so any state you had, you hadn't looked up the successors yet; maybe keep a dict mapping state ids (since they may not be hashable) to successors
        scored_successors = [(safe_get_score(successor), successor) for successor in successors]
        states = sorted(states + scored_successors)[-fringe_limit:]
    if key is not None:
        cache[key] = best_state # ? call sense() instead
    return best_state


def clean_text(s):
    """TTS does far less than an ideal job of phrasing, saying acronyms and numbers, etc."""
    if isinstance(s, unicode):
        # http://www.fileformat.info/info/unicode/block/general_punctuation/list.htm
        s = s.translate(dict((k, ord(v)) for k, v in { 8216 : "'", # left single quote = 0x2018
                                                       8217 : "'", # right single quote
                                                       8220 : '"', # left double quote
                                                       8221 : '"', # right double quote
                                                       0x201E : '"',
                                                       0x201F : '"',
                                                       #8226 : '-', # bullet
                                                       #8211 : '-', # en dash
                                                       #8212 : '-', # em dash
                                                       #8230 : '...', doesn't work in translate()
                                                       160 : ' ', # non-breaking space = u'\xa0'
                                                       }.iteritems()))
        # 'Unicode character properties database' might tell me what is an apostrophe
        # 8217 = 0x2019
        for repl in { u'\u00B0' : ' degrees ',
                      # important to have spaces else the TTS will say 'x dash y' for 'x-y'
                      u'\u2013' : ' - ', # en dash
                      u'\u2014' : ' - ', # em dash # treat like a comma else the TTS say 'blah dash duh'; - still got that silly problem with ieee webpages
                      u'\u2022' : ' - ', # bullet
                      u'\u2026' : '...', # ellipsis
                      u'\x85' : "...",
                      u'\x91' : "'", # left single quote mark
                      u'\x92' : "'", # right single quote mark
                      u'\x93' : '"',
                      u'\x94' : '"',
                      u'\x95' : '-', # bullet
                      u'\x96' : '-', # en dash (support to be ' to ' for number ranges)
                      u'\x97' : '--', # em dash
                      u'\xa3' : ' pounds ', # if a str with that latin-1 char is decoded to unicode, it's still there
                      u'\xa9' : ' copyright ',
                      u'\xad' : ' - ', # soft hyphen latin-1
                      }.iteritems():
            s = s.replace(*repl)
    else:
        # http://www.pemberley.com/janeinfo/latin1.html from mcirosoft code page 1252, chars in the 128-159 range which still somehow appear in Python unicode strings
        # should really probably be done earlier elsewhere
        # ? could unicode strings stupidly contain these latin-1 chars; if so, was that why i had them decorated as u'\x85' etc.
        for repl in { '\x85' : "...",
                      '\x91' : "'", # left single quote mark
                      '\x92' : "'", # right single quote mark
                      '\x93' : '"',
                      '\x94' : '"',
                      '\x95' : '-', # bullet
                      '\x96' : '-', # en dash
                      '\x97' : '--', # em dash
                      '\xa9' : ' copyright ',
                      }.iteritems():
            s = s.replace(*repl)

    entities = dict( # http://www.mozilla.org/newlayout/testcases/layout/entities.html
        amp='&',
        quot='"',
        apos="'",
        lt='<',
        gt='>',
        nbsp=' ',
        copy=' copyright ',
        reg=' trademark ',
        middot=' ',
        mdash='--', # emphasis dash
        lsquo="'",
        rsquo="'",
        tilde='~',
        ldquo='"',
        rdquo='"',
        pound=' pounds ', # UK pound money sign
        )
    entity_re = re.compile(r"""
        &
        (
            \#\d+   # is either a pound (#) and numbers (would really either be a number or 'x' followed by a hex number)
            |       # or
            \w+     # has letters & numbers
        )
        ;
        """, re.VERBOSE)
    s = entity_re.sub(lambda m: entities.get(m.group(1), ' '), s)

    s = re.sub('#{2,}', ' ', s)
    s = re.sub('-+>', ' to ', s) # ex: 'english->german translater'
    s = re.sub(r'(?<=\w) *< *(?=\d)', ' less than ', s)
    s = re.sub(r'(?<=\w) *> *(?=\d)', ' greater than ', s)
    s = re.sub('[<>|_]', ' ', s) # SAPI seems to just not speak anything between angle brackets when in XML mode; but don't remove my <silence tags

    s = re.sub(r'(?<=[a-z]\.)(?=[A-Z][a-z])', ' ', s) # ex: '... journal Science.The new ...'
    s = re.sub(r'(?i)(?<=:)(?=[a-z])', ' ', s) # ex: 'Her mail follows:I know', but not for '12:34:56'

    # for pronouncing paths r'd:\music\zombie\scary.mp3'
    s = re.sub(r'(?i)(?<=\b[a-z]):(?=\\)', ' colon ', s)
    s = s.replace('\\', ' backslash ') # oddly '\\' is naturally pronounced 'backquote'

    number_regex = r'\d+(?:,?\s*\d{3})*(?:\.\d+)?'
    def parse_number(s):
        return float(re.sub('[, ]', '', s))

    s = re.sub(r"['`](?=[0-3]\d\b)", '20', s) # "found in '08."
    s = re.sub(r"['`](?=[4-9]\d\b)", '19', s)

    # all this cleaning a text for a TTS resembles lame patent http://www.freepatentsonline.com/5634084.html

    months = dict(zip(imap(str.lower, calendar.month_abbr[1:]), calendar.month_name[1:]))
    s = re.compile(r'\b(%s)(?=\s+\d+)' % '|'.join(months), re.I).sub(lambda m: months[m.group(1).lower()], s) # 'Mar 1, 2007' -> 'March first, 2007'

    # - recognize 2007-01-29 22:22:35 as date & time and say nicely; '2007-01-29' > 'January 29th, 2007',
    # 04/28/2006 M/D/Y
    #s = re.sub(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b', lambda m: time.strftime('%B %d, %Y', tuple(imap(int, [m.group(3), m.group(1), m.group(2)])) + (0,)*6), s) # has to be before the 'per' replacement below # didn't use this cuz leading 0s on day of month would be spoken
    # - unless the regexps are very tight for the month, they can cause index errors with the caelndar month lookup
    # - would be nice to just make a dict of all these date formats, since there's a lot of code repetition here; use named regex groups
    s = re.sub(r'\b(\d{4})/([01]\d|[1-9])/([0-3]\d)\b', lambda m: '%s %s, %s' % (calendar.month_name[int(m.group(2))], m.group(3).lstrip('0'), m.group(1)), s) # ex: 'YYYY/MM/DD'
    s = re.sub(r'\b([01]\d|[1-9])/([0-3]\d)/(\d{4})\b', lambda m: '%s %s, %s' % (calendar.month_name[int(m.group(1))], m.group(2).lstrip('0'), m.group(3)), s) # ex: 'MM/DD/YYYY'; has to be before the 'per' replacement below
    s = re.sub(r'\b(\d{4})-(\d{1,2})-(\d{2})\b', lambda m: '%s %s, %s' % (calendar.month_name[int(m.group(2))], m.group(3).lstrip('0'), m.group(1)), s) # '2007-06-12'
    s = re.sub(r'\b(\d{4})\.(\d{1,2})\.(\d{2})\b', lambda m: '%s %s, %s' % (calendar.month_name[int(m.group(2))], m.group(3).lstrip('0'), m.group(1)), s) # '2007.06.12'
    # - 20080207
    s = re.sub(r'\b([01]\d)-([123]\d)-(\d{2})\b', lambda m: '%s %s, 20%s' % (calendar.month_name[int(m.group(1))], m.group(2).lstrip('0'), m.group(3)), s) # M-D-Y; '1-31-07' -> 'jan 31, 2007'
    s = re.sub(r'\b(\d{1,2})/(?=(\d{1,2})\s+(\d{1,2}):(\d{2})\b)', lambda m: '%s ' % calendar.month_name[int(m.group(1))], s) # "Update: 03/11 17:24 GMT by"
    s = re.sub(r'\b(\d{4})([01]\d)(\d{2})\b', lambda m: '%s %s, %s' % (calendar.month_name[int(m.group(2))], m.group(3).lstrip('0'), m.group(1)), s) # '20080120' > 'January 20, 2008'

    # ? '1/10' -> '1 tenth'

    s = re.sub(r'\b(?<![-"])(%s)"' % number_regex, lambda m: '%s inch' % m.group(1), s) # for 'My 19" LCD' -- assuming that wasn't just a number at the end of a quote!! like '"24"' or '"SR-72"' or 'blah "Fixed in Build 327" duh'!!

    s = re.sub(r'\b(%s)\^(\d+)' % number_regex, lambda m: '%s to the power of %s' % m.groups(), s) # ' to the ' or ' exponent '

    s = re.sub(r'("|\'(?!\w)|(?<!\w)\')+', "<silence msec='350'/>", s) # remove quotes, except apostrophes, because AT&T occasionally speaks them

    # important to insert silences after i remove single quotes
    s = re.sub(r'(?<=\S)([ \t]*\n[ \t]*){2,}\s*(?=\S)|\n+[ \t]*(?=[A-Z])|\.{3,}', "\n<silence msec='750'/>\n", s) # stupid AT&T TTS or MS SAPI doesn't pause between words separated by more than 2 \n line separators # not quite ideal cuz it fails to handle space at the end of a line or in between!!
    #s = s.replace('/', ' per ') # '/' can mean 'per', 'divide', or 'or' depending on the context. would really be nice if this was all done teleoloigcally with each substituion as a pattern, and then the patterns could be taught in english and then would just need to forget the old texts when there's a new pattern that may change them!!! so at least move these substitions to patterns, and find() them here, have a grammar rule to create sub patterns from english, and have that forget all str patterns made by this effector and have this effector mention itself in the str patterns it effects

    s = re.compile(r'\bw/\s*(?=\w)', re.I).sub('with ', s) # 'w/something'
    s = re.sub(r'(?i)\b(%s)\s*/\s*(?=[a-z])' % number_regex, lambda m: '%s per ' % m.group(1), s) # ex: 5/hour -- only do this if the word is a unit: 'sec|min|hour|day|month|year|km|...' -- not 'Noke 1080/i phone', though rare
    #s = re.compile(r'([a-z]+)\s*/\s*([a-z]+)', re.I).sub(lambda m: '%s and %s' % m.groups(), s) # bad for slashes in URLs!!
    s = re.sub(r'(?i)(?<=\b[a-z])/(?=[a-z]\b)', ' ', s) # ex: 'A/V switch', 'I/O'
    s = re.sub(r'(?i)\band/or\b', 'and or', s)
    s = re.sub(r'(?i)[a-z]+(\s*/\s*[a-z]+)+', lambda m: ' and '.join(m.group().split('/')), s) # to handle 'xxx/yyy/zzz'
    s = re.sub(r'(?i)(?<=[a-z])/(?=\d)', ' ', s)

    number_unit_regex = r'(?i)%s\s*(?:m|cm|ft|feet|inches)?' % number_regex # - should have a dict of units combined with the rest in the dict below! - shouldn't there be \b after the unit names
    s = re.sub(r'(%s)\s*x\s*(%s)' % (number_unit_regex, number_unit_regex), lambda m: '%s by %s' % m.groups(), s)

    # phone numbers
    repl_phone_number = lambda m: " <silence msec='300'/> ".join(' '.join(n.strip('()')) for n in m.groups())
    for pattern in [r'(\(\d{3}\))\s*(\d{3})-(\d{4})\b', r'\b(\d{3})\.(\d{3})\.(\d{4})\b']:
        s = re.sub(pattern, repl_phone_number, s)

    # times: 12:34
    # ? what about this difference between '13 hours & 5 minutes' & '1 15 PM' -- how to distinguish? & '5:32' could min:sec, not hour:min; well, this is probably better than nothing
    s = re.sub(r'\b(\d+):(\d{1,2}):(\d{1,2})\b', lambda m: '%s hours, %s minutes and %s seconds' % tuple(s.lstrip('0') or 0 for s in m.groups()), s) # - proper pluralizing
    s = re.sub('\b(\d+):(\d{1,2})\b', lambda m: '%d hours and %d minutes' % m.groups(), s)

    #s = re.sub(r'\bSEO\b', 'search engine optimization', s)
    #s = re.sub(r'\bAISIP\b', ' Association of Independent Software Industry Professionals', s)
    # - these acronyms should be patterns in the universe!!!
    for abbrev, repl in { # stupid net acronyms; must do this before the acronym spacer below
            'IRL' : 'in real life',
            'POV' : 'point of view',
            'IIRC' : 'If I recall correctly',
            'AKA' : 'also known as',
            'IMO' : 'in my opinion',
            'IMHO' : 'in my opinion',
            'OTOH' : 'on the other hand',
            'LOL' : '',
            'm{3,}' : '',
            'HTH' : 'hope this helps',
            'PITA' : 'pain',
            'ANN' : 'announcement',
            'AFAIK' : 'as far as i know',
            'DIY' : 'do it yourself',
            'TIA' : ' ', # 'thanks in advance'
            'BTW' : 'by the way',
            'FWIW' : "for what it's worth",
            'GMT' : 'Greenwich Mean Time',
            'Fwd' : 'Forward',
            #'Re' : 'Regarding', # 're:\b' doesn't match 're: guh' because the \b doesn't match there -- can't be here because it would match "you're"
            #'gb' : 'gigabytes',
            #'mb' : 'megabytes',
            #'kb' : 'kilobytes',
            #'vs' : 'versus', # - phrasing problem when handling 'x vs. y' cuz the '.' remains; as usual, we have a context problem here because a '.' may indicate an abbreviation or a period
            'govt' : 'government',
            'exe' : 'executable',
            'IEEE' : 'I triple E', # - could automatically handle saying double/truple for repeated arconym letters
            'xbox' : 'X Box', # otherwise the 'x' is hardly pronounced
            'R\s*&\s*D' : 'research & development',
            'homepage' : 'home page', # AT&T pronounces 'homepage' as 'hom-a-page'
            'JPEG' : 'j peg',
            'VoIP' : 'voice over IP',
            #'SEC' : 'Securities and Exchange Commission', # of course, these could really be contextual -- 'sec' is an abbrev. of 'second'
            'GUI' : 'graphical user interface',
            'plugin' : 'plug in', # AT&T pronounces as 'pluge-in'
            'plugins' : 'plug ins',
            'WYSIWYG' : 'wiz e wig',
            'Ada' : 'ay duh', # otherwise AT&T pronouces as 'a duh'
            'wii' : 'we',
            'Tycho' : 'Tyko',
            'AT\s*&\s*T' : 'A T and T', # otherwise the 'AT' is pronounced like the preposition
            'F/X' : 'F X', # - spoiled by the ' and ' rule above
            r'(?<!\.)info' : 'information', # would be nice with abbreviations to also remove the '.', if any, when not also the end of he sentence!!!
            'et al' : 'and others',
            'nvidia' : 'n vidia',
            'youtube' : 'you tube',
            'i18n' : 'internationalization',
            #'BSoD' : 'Blue Screen of Death
            'LtU' : 'Lambda the Ultimate', # - how could this have been handled automatically? instead of becoming 'lieutenant U'
            'spaceflight' : 'space flight',
            '20/20' : '20 20', # else it says '20 slash 20'
            'WWII' : 'World War 2',
            'WWI' : 'World War 1',
            'ciao' : 'chow', # AT&T pronounced as 'psi ow'
            'kiosk' : 'key osk',
            'next[ -]?gen' : 'next generation', # AT&T pronounces 'gen' as 'general'
            #'LED' : 'light emitting diode' -- but what if it's just 'led' capitalized? need something smarter than this, contextual (knowing conditions) which are to complex to spell out, so must be learned, so the engine must learn from me
            'hm+' : 'hum', # AT&T say 'hmm' as 'h m m'
            'CO2' : 'carbon dioxide',
            'gmail' : 'g-mail',
            'nth' : 'enth', # like much else, Vista would have pronounced this properly -- though still, Vista would mispronounce many things, so the important thing is to have some natural way (ie, demonstration) to correct pronunciation
            }.iteritems():
        s = re.sub(r'(?i)\b%s\b(?!-)' % abbrev, repl, s)

    s = s.replace('&', ' and ') # for 'Thunder&Lightning'

    def roman_to_decimal(s):
        n = i = 0
        for numeral, integer in [('M',  1000), # the order of these pairs matters
                                 ('CM', 900),
                                 ('D',  500),
                                 ('CD', 400),
                                 ('C',  100),
                                 ('XC', 90),
                                 ('L',  50),
                                 ('XL', 40),
                                 ('X',  10),
                                 ('IX', 9),
                                 ('V',  5),
                                 ('IV', 4),
                                 ('I',  1)]:
            while s[i:i+len(numeral)] == numeral:
                n += integer
                i += len(numeral)
        return n
    def repl_roman_numeral(m):
        s = m.group()
        # 'IV' could be a number or an acronym; it's in the words file
        return str(roman_to_decimal(s)) if s.isupper() and (s == 'VI' or not is_word(s)) else s
    s = re.sub(r'\b(?!CV\b)M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|VI{0,3}|I{2,3}|XI{1,3})\b', repl_roman_numeral, s) # 'CV' has many other meanings
        # - if not is_word
    s = re.sub(r'(?i)\bpart +I\b', 'Part 1', s) # 'I' is very contextual

    # units can't be done in the top set of substitutions because units can be next to the # wo/space between
    # for '250MB', '5 GB', '$25k'
    for abbrev, repl in { # - should do many of these even if not preceded by a number
            'lbs' : 'pounds',
            'lb' : 'pound',
            'ft' : 'foot',
            'k' : 'thousand',
            'm' : 'million',
            'B(?!\.C)' : 'billion', # unreliable, because of, eg, 'Program 1.0b'; b as in beta; maybe allow '$ [number] b'
                # some places use '1.1b' as '1.1 billion', but others use '1.1b' as '1.1 beta' -- how to tell easily? also could be 'byte' (but not with 1.1); the solution is ultimately not fixed patterns but context & energy
            'bn' : 'billion',
            'kb' : 'kilobytes',
            'mb' : 'megabytes',
            'gb' : 'gigabytes',
            'kbps' : 'kilobits per second',
            'mbps' : 'megabits per second',
            'cm' : 'centimeter',
            'nm' : 'nanometer',
            'mm' : 'millmeter',
            'fps' : 'frames per second',
            'mhz' : 'mega hertz',
            'ghz' : 'giga hertz',
            'W' : 'watts',
            'mW' : 'milliwatts',
            'g' : 'gram', # but not for '3G' as in cell phones
            'kg' : 'kilograms', # or 'kilogram' depending on context! 'a 5 kg weight', 'weighs 5 kg'
            'mph' : 'miles per hour',
            'kph' : 'kilometers per hour',
            }.iteritems():
        #s = re.compile(r'(?<=\d\s*)%s\b' % abbrev, re.I).sub(repl, s) # look-behind must be fixed width
        #s = re.compile(r'(\d)\s*(%s)\b' % abbrev, re.I).sub(lambda m: '%s %s' % (m.group(1), repl), s)
        s = re.sub(r'%s(\b%s)\s*(%s)\b' % ('' if abbrev in ['B', 'g', 'W'] else '(?i)', number_regex, abbrev), lambda m: '%s %s' % (m.group(1), repl), s) # can't just go '(?<=\d)' because it would accidentally match 'alpha6b'

    # AT&T pronounces 1-2 digit numbers fine, but, eg, it pronounces '400000' as 'four zero zero zero ...'; '395' is pronounced '3 hundred 95' (no and)
    def format_hundreds(n):
        """format 1-4 digit numbers"""
        assert n < 10000
        if n < 100:
            return '%d' % n
        elif n % 100:
            return '%d hundred and %d' % (n // 100, n % 100)
        else:
            return '%d hundred' % (n // 100)
    def format_number(n, groups=['', 'thousand', 'million', 'billion', 'trillion']):
        decimal, n = __import__('math').modf(n)
        if 1000 < n < 10000 and (n // 100) % 10:
            s = format_hundreds(n) # eg: '45 hundred and 12'; this syntactic special case
        elif 100 < n < 1000 ** len(groups):
            parts = []
            for i, group in enumerate(groups):
                x = n // (1000 ** i)
                if x == 0:
                    break
                if x % 1000:
                    parts.insert(0, '%s %s' % (format_hundreds(x % 1000), group))
            s = ('%s and %s' % (', '.join(parts[:-1]), parts[-1])) if len(parts) > 1 else parts[0]
            #if len(parts) > 1:
            #    s = '%s and %s' % (', '.join(parts[:-1]), parts[-1])
            #elif parts:
            #    s = parts[0]
            #else: # n=1000000000000000, eg
            #    s = '%d' % n
        else:
            s = str(int(n))
        return ('%s point %s' % (s, ' '.join(str(decimal)[2:]))) if decimal else s
    def format_currency(x): # - should be in specialized text effector!!
        # apparently sapi will handle "$59.95" correctly, but it would probably needlessly say the cents for "$37.00"
        cents, dollars = __import__('math').modf(x)
        s = ''
        if dollars:
            s += '%s dollars' % format_number(dollars)
        if dollars and cents:
            s += ' and'
        if cents:
            s += ' %d cents' % (round(cents, 2) * 100)
        return s
    s = re.sub(r'(?i)\$(%s)\b(?!\.\d)(?!\s+(hundred|thousand|million|billion))' % number_regex, lambda m: format_currency(parse_number(m.group(1))), s) # - doesn't handle commas; the (?!\.\d) is just to keep it from cheating & matching only the '$1' in '$1.2 billion'
    # this really only seems to be needed because the operation below otherwise prevents the TTS from doing a pretty good job with '$24.32'., except that it would pronounce a cents of '.00', which is annoying
    # these 2 sort of compete
    s = re.sub(r'(?i)(\$|pounds)\s*(%s(?:\s*(-|to)\s*%s)?(?:\s*(hundred|thousand|million|billion))?)(\s+dollars)?\b' % (number_regex, number_regex), lambda m: '%s %s' % (m.group(2), {'$' : 'dollars'}.get(m.group(1), m.group(1))), s) # '$38-40 thousand' > '38 to 40 thousand dollars'; needs to be after expanding abbreviations of kilo etc, & before turning '4-5' > '4 to 5'; i ignore a trailing 'dollars' because of cases like '$3.5 million dollars'

    s = re.sub(r'(?<=[CV])s\b', "eas", s)
    s = re.sub(r'(?<=\w[A-Z])(?=s\b)', "'", s)
    #s = re.sub(r'(?<=\w[A-Z])(?<!\bPC)(?=s\b)', "'", s) # to fix: pronounces CPUs as 'c p us'; 'ISPs' -> 'i s p s'; 'MPs' - 'm p s'; "I S P's" pronounces right
    # v 'PCs' does get changed to "PC's" but for some odd reason, is pronounced as 'P C s' on the home machine, but my vista machine (but same AT&T TTS) pronounces PC's & PCs correctly
    # v didn't work for 'ULCPCs'; well, it worked on Vista wo/help, but it seems XP doesn't know to pronounce the end of that work like it does know how to pronounce PCs
    #s = re.sub(r"(?i)(?<=\d)'(?=s\b)", '', s) # "3.0's" was pronounced as '3 point 0 s'; is the speech synthesizer causing that, cuz i haven't found the code here that affects that apostrophe; i've looked over all the code & see nothing here causing this; yeah, the sythesizer's doing it; this solution didn't work; maybe turn the numbers into a word

    for abbrev, repl in {
            r'(?i)(?<=\d)\s*(A\.M\.|AM)(?!\S)' : ' a m ',
            r'(?i)(?<=\d)\s*(P\.M\.|PM)(?!\S)' : ' p m ',
            r'(?i)\be(\.?)g\.' : 'for example',
            r'(?i)\bi(\.e?)\.' : 'that is',
            r'(?i)\bn\.b\.' : '', # nota bene: note well
            r'(?i)\bprof(\.|\b)' : 'professor',
            r'\b[Aa]vg(\.|\b)|\b[Aa][Vv][Gg]\.' : 'average', # so it doesn't match 'AVG virus scanner'
            r'(?i)\besp\.' : 'especially',
            r'(?i)\bbros\.' : 'brothers',
            r'(?i)\bapprox\b(\.?)' : 'approximately',
            r'(?i)\bp\.\s*(?=\d)' : 'page ', # don't want to match 'P2' since that could be 'P2P' or who knows what
            r'(?i)\bpp(\.?)\s*(?=\d)' : 'pages ',
            r'(?i)(?<=\d)\s*pp(\.?)' : ' pages',
            r'(?i)\bvs(\.|\b)' : 'versus', # can safely eat the '.' since it's very unlikely to also be the end of the sentence
            r'(?i)\bno(\.|:)(?=\s+\d)' : 'number',
            r'(?i)\binc(\.|\b)' : 'incorporated',
            r'\bdB\b' : 'decibel',
            r'\b([dD]b)|(DB)\b' : 'd b', #'database', # AT&T annoyingly pronounces '(?i)db' as 'decibel' -- is there some raw mode for SAPI? to keep the TTS from being clever?!!
            r'\bCo\b' : 'co', # AT&T or SAPI again tries to be clever & pronounces 'Co' as 'Company'
            r'(?i)(?<=Cambridge, )Mass\b' : 'Massachusetts', # - slightly mishandles 'Cambridge, Mass. is fun.' as 'Cambridge, Massachusetts. is fun.'
            r'\bIT\b' : 'I T', #'information technology', # otherwise 'IT' is pronounced like 'it'; - handle 'domain.it' -- could be bad if it was just the word 'it' captialized in a title!!!
            r'(?i)\bRe:' : 'Regarding',
            r'[Hh]tml' :'HTML ', # ex: 'HtmlApp'
            r'\bDoS\b' : 'denial of service', # don't match 'DOS'
            r'(?i)vol(\.?)(?=\s*\d)' : 'volume',
            r'(?i)\bbookmark to:\s*$' : '',
            r"'[Nn]\b" : ' and',
            r'\b([A-G])#' : lambda m: '%s sharp ' % m.group(1), # 'C#3.0'?
            r'\b(%s)\s+C(?![-+])\b' % number_regex : lambda m: '%s degrees celsius' % m.group(1), # problem with 'W3C' because a rule above turns that into 'W 3 C' & then '3 C' is now seen as a temperature; what to do?! solution: do this before spacing letters & numbers
            r'(?i)(?<=\bdegrees)\s+C\b' : ' celsius',
            r'(?i)(?<=\bdegrees)\s+F\b' : ' fahrenheit',
            r'(?i)\(R\)' : ' ', # (R)=registered trademark
            r'\bmic\b' : 'microphone',
            r'\s*[:;][-@]?[)(DOpP]' : "<silence msec='400'/>", # infantile smilies http://members.aol.com/bearpage/smileys.htm
            #r'(?i)\bsponsored by ' : '.',
            r'\b[Ss]td\b' : 'standard', # caps STD is disease acronym
            }.iteritems():
        s = re.sub(abbrev, repl, s)

    s = re.sub(r'(?i)(?<=[a-z])(?=\d)|(?<=\d)(?=[a-z])(?!(st|nd|rd|th|s)\b)', ' ', s) # insert spaces between numbers & letters; the 's\b' exception is for '80s'

    s = re.sub(r'(?i)((?:[-\w]+\.)+)([A-Z]{2,})', lambda m: ' %s %s' % (m.group(1).upper().replace('.', ' dot '), (' '.join(m.group(2)) if len(m.group(2)) == 2 else m.group(2).upper())), s) # to pronounce domains like 'getgames.ca' in & out of e-mail addresses; has to be [A-Z], not \w for the last pattern else it matches '1.02' eg

    def process_word(m):
        """
        re.sub callback that splits camel-cased terms which are not dictionary words.

        m: regex match object; only m.group() (the matched run of letters) is read.
        Returns the matched text unchanged when is_word() accepts it; otherwise returns it with a space inserted before every capital letter that directly follows a lowercase letter, except the capital right after a word-initial 'Mc'.
        """
        token = m.group()
        if is_word(token):
            return token
        # open idea from the original notes: for a term like 'CLARAty' ('clara' is in the dictionary) it might be better to keep letters together when they form a 3+ letter word, the way space_acronym does for upper-case runs
        return re.sub(r'(?<=[a-z])(?<!\bMc)[A-Z]', lambda cap: ' %s' % cap.group(), token)
    s = re.sub(r'(?i)\b[a-z]{2,}', process_word, s)

    def space_acronym(m):
        """
        re.sub callback: insert spaces into one matched run of letters so the TTS spells or reads it sensibly.

        Ex: 'UGOBE' -> 'U GO BE', instead of the more verbose 'U G O B E'; does 'TWiT' as 'twit', not 'T W i T'
        Allows a sequence of letters to remain together for as far as they match a dictionary word.
        Only use on uppercase sequences of letters, since they'd be pronounced verbosely anyway, as acronyms; don't use on lowercase letters because the TTS can usually pronounce them in some sensible way even when it's not a known word (which is many: trying, kansas).
        ~ 'MEPs' -> became 'me ps'; why? it should be "M E P's"; i think another rule spoiled this; upper() fixed this too

        m: regex match object; only m.group() is read.
        Returns the term with spaces between its parts: either every letter separated (short / special-cased terms, original case kept) or the parts chosen by search(), upper-cased.
        """
        term = m.group()
        lower_term = term.lower()
        # terms of up to 3 letters that is_word() rejects, plus a few hard-coded exceptions, are simply spelled out letter by letter
        if (len(lower_term) <= 3 and not is_word(lower_term)) or lower_term in ['rfid', 'us', 'la']:#, 'api', 'cmu', 'dna', 'url']:
            return ' '.join(term)
        def get_next_states((rest, words)):
            # Successor generator handed to search(): a state is (rest, words), where rest is the not-yet-consumed suffix of lower_term and words is the tuple of parts chosen so far.
            # Each acceptable prefix of rest yields a new state with that prefix moved from rest to words.
            # CO2 should be 'C O 2' or 'carbon dioxide' but this likes to say 'CO' together, to sound nice, but maybe shouldn't if it's the whole word; actually, i just convert that abbreviation above
            for i in xrange(1, len(rest) + 1):
                part = rest[:i]
                # a prefix is acceptable when it is a single letter, or a dictionary word (short words in the stop list only count when they are the entire remaining rest), or one of the always-allowed parts 'co', 're', 'com'
                if i == 1 or (is_word(part) and (part not in ['do', 'as', 'is', 'am', 'id', 'us', 'ma', 'ni', 'ms'] or part == rest)) or part in ['co', 're', 'com']: # the 'part == rest' test lets a stop-listed word through when it is all that remains, eg the whole word is 'IS' (note: the code compares against rest, not lower_term)
                    yield rest[i:], (words + (rest[:i],))
        # NOTE(review): search() is defined elsewhere in this file; presumably it explores states from (lower_term, ()) via get_next_states and returns the best-scoring (rest, words) state -- confirm.
        # If higher scores win, (-len(rest), -len(words)) favours consuming the whole term first and then using the fewest parts; the meaning of time_limit and key is not visible here.
        best = search((lower_term, ()), lambda (rest, words): (-len(rest), -len(words)), get_next_states, time_limit=0.25, key=('acronym pronunciation', lower_term))
        return ' '.join(best[1]).upper() # converting to upper helps with 'HTML' which, because 'ht'&'ml' are in file('words'), becomes 'ht ml', where AT&T will pronounce 'ml' as the unit; upper() stops that
    s = re.sub(r'[A-Z]{2,}', space_acronym, s) # take every 2+ capital letters, & insert spaces wherever there isn't a dictionary word

    # handles extra capital letters in terms, assuming the term isn't just a capitalized word
    #s = re.sub(r'[A-Z]{2,}', lambda m: (' %s' if m.start() > 0 and s[m.start() - 1].isalnum() else '%s') % (m.group() if is_word(m.group()) else ' '.join(m.group())), s) # also handles, eg, 'DBMaster' -> ' D B Masters'; 'EasyNN-plus' # unless the surrounding text is all caps # ideally, only insert a leading space if there's a letter right before the acronym (otherwise the left parenthesis in '(MSN)' is spoken)
        # ' %s' % m.group() - because '1UP' would be pronounced as '1 U P'; i suppose because it started with a #

    #s = re.sub(r'(?<![-\w])-+(?!\d)', ' , ', s) # would pronounce dashes, so I remove them unless for a -ve number; I replace with a comma because it causes AT&T's TTS to pause

    #s = re.sub('-{2,}', ' ', s)

    s = re.sub(r'\b(?<!\d-)(%s)-(%s)(?!-)\b' % (number_regex, number_regex), lambda m: '%s to %s' % m.groups() if cmp(*map(parse_number, m.groups())) < 0 else m.group(), s) #lambda m: '%s to %s' % m.groups(), s) # '3-4' > '3 to 4'; could be bad for serial # i check if the 2nd # < 1st for expressions like '24-7'
    s = re.sub('-{2,}|\s+-\s+', " <silence msec='500'/> ", s)
    s = re.sub(r'-(?!\d)|(?<!\s)-', ' ', s) #s = re.sub(r'-+(?!\d)', ' ', s) # AT&T pronounced the dash in 'US-China', so I'm inclined to just remove any dash that isn't a minus

    s = re.sub(r"\b(?<!')%s\b" % number_regex, lambda m: format_number(parse_number(m.group())), s) # AT&T TTS doesn't even say regular numbers nicely; eg, for 1917 it says '1917'; the quote thing is to prevent it from doing this to the msec XML delays in speech

    # - having a problem with the paren replacement with commas i think: '''n Grand Unified Theories , aka GUTs, ."''''

    s = re.sub(r',(?!\s|\d)|(?<!\w|\d),', ' ', s) # because of text like '"blah duh!", guh'; a line above would replace the '"' in that example with a space
    s = s.replace('@', ' at ') # AT&T pronounces '@' as 'at sign'
    s = re.sub(r'\$+', '$', s) # '$$$'

    s = re.sub('[*`]', ' ', s)
    #s = s.replace('*', ' ') # naturally pronounces 'Live*' as 'L I V E *'
    #s = s.replace('`', ' ') # AT&T speaks backquotes

    # english voice audrey pronounces beta oddly, but the american voices say it correctly

    # - '$100 dog' vs 'my dog is worth 100$; in the 1st $=dollar, in the 2nd $=dollars -- what's the difference?
    #s = re.compile(r'\$(\d+(\.\d+)?)(?:\s*(B|billion)\b)', re.I).sub(lambda m: '%s billion dollars' % m.group(1), s) # AT&T mispronounces '$3.3 billion' as '3 $ . 3 billion'; '\d+\.\d+ (billion|million|thousand|hundred)' # "$2.3B"
    #s = re.compile(r'\$(\d+(\.\d+)?)(?:\s*(M|million)\b)', re.I).sub(lambda m: '%s million dollars' % m.group(1), s)

    s = re.sub(r'(?<!\d)\.(?=[a-z])', ' dot ', s) # prononce a period as 'dot' if it occurs at the start of a word, like '.com', '.net' but not '.2'; case sensitive because i don't want 'U.S.' -> 'U dot S.'

    s = re.sub(r'(?<!\S)~(?=\d)', ' approximately ', s) # '~20 years'
    s = s.replace('~', ' ')

    s = re.sub(r'\d{5,}', lambda m: ' '.join(m.group()), s) # pronounce long #s '18927302187' as individual digits, otherwise the TTS says 'million and' etc; for orders #s; 5 is the min. so years are said properly -- isn't this taken care of by something above?

    # - 'rapid city, SD' -> 'rapid city, south dakota'

    s = re.sub(r'(?<=\w)\(s\)', 's', s) # ex: 'computer(s)'

    #s = re.sub(r'[()](?![,.])', "<silence msec='250'/>", s) # replace parens (when not followed by other punctiation) with pauses
    s = re.sub(r'[():]', "<silence msec='250'/>", s) # well, could be '(.NET' where i don't want the 'left parenthesis' said
    # 1 item used colons in headings oddly: ":: HEADING ::\nBlah..."
    s = s.replace(')', '') # AT&T will speak lone right parenthesis
    s = re.sub(r'\[\d+\]', '', s) # footnote pointers: 'Duh blah [3]. Guh...'
    s = re.sub(r'[\[\]]', "<silence msec='250'/>", s)

    s = re.sub(r'(?<![\w%])[?!.]', ' ', s) # remove stand-alone question marks; ex: "Do you (I think I know the answer!)?"

    s = re.sub(r'(?i)\b(fuck|fart|cock(?!roach|fight)|crap|shit|bullshit|bitch|pimp|pussy)\w*', 'bleep', s)

    s = re.sub(r'(?i)\bneuro(?=\w{5})(?!l)', 'neuro-', s) # AT&T pronounces it as neur-oscience; not done in the table so it will work for 'neuroscientists', etc. neuropsychology; but not neurologist

    s = re.sub(r'(?i)\bdollars\s+(?=dollars\b)', '', s) # some texts stupidly say things like '$50 dollars'

    return s


if __name__ == '__main__':
    # Script entry point: read the text file named on the command line (or use a
    # short built-in sentence when no argument is given), normalise it with
    # clean_text(), have SAPI speak it into a .wav file, then open that file.
    if len(sys.argv) > 2:
        # explicit check rather than `assert`, which is stripped under `python -O`
        sys.exit('Usage: python txt2wav.py x.txt')
    if len(sys.argv) > 1:
        src_path = sys.argv[1]
        # close the input file deterministically instead of relying on garbage collection
        src_file = open(src_path)
        try:
            s = src_file.read()
        finally:
            src_file.close()
        wav_path = change_file_ext(src_path, 'wav')
    else:
        s = 'this is a test'
        wav_path = 'default.wav'
    #print wav_path; sys.exit()

    # a few fixes for my own book Mind Making
    book_fixes = [
        (r'(?sm)^Copyright.*?(?=For)', 'Copyright Patrick Roberts\n\nAll rights reserved\n\n'),
        (r'(?sm)^Clusters.*?(?=Lessons from a Machine Mind\s*$)', '\n\n'),
        (r'(?m)(?<=^\d\.)(?=[A-Z])', ' '),
        (r'(?i)cormind', 'Cor Mind'),
    ]
    for pattern, repl in book_fixes:
        s = re.sub(pattern, repl, s)

    print('Transforming text...')
    s = clean_text(s)
    #file(change_file_ext(wav_path, 'transformed'), 'w').write(s); sys.exit()

    stream = win32com.client.Dispatch('SAPI.SpFileStream')
    tts = win32com.client.Dispatch('SAPI.SpVoice')
    # installed voices keyed by the last path component of their Id; only needed by the optional voice selection below
    voices = dict([(os.path.basename(voice.Id), voice) for voice in tts.GetVoices()])
    #print voices; tts.Voice = voices['ATT-DT-14-Audrey16']

    stream.Open(wav_path, 3) #SSFMCreateForWrite=3
    try:
        tts.AudioOutputStream = stream

        print('Writing speech...')
        tts.Speak(s, 0)
    finally:
        # always release the output file, even if speaking fails part-way
        stream.Close()

    #print 'Encoding mp3...'
    #mp3_path = change_file_ext(wav_path, 'mp3')
    #recodeAudio(wav_path, mp3_path, 'mp3')

    os.startfile(wav_path) # open the result in the default .wav handler (os.startfile is Windows-only)
