#!/usr/bin/env python

##  Name of the script:  docx2txt

##  by Avi Kak (kak@purdue.edu)   September 8, 2009


##  Docx to Text Converter:
##
##  This script produces an ASCII dump of a docx file.  Note that the
##  ASCII dump will not retain any of the format and style information
##  of the original document.  You are also forewarned this is just a
##  poor man's text extractor that should give satisfactory results on
##  most simple documents.  It should be easy to extend the script to
##  retain at least the paragraph level divisions in the original
##  document.  At this time, I simply discard all the markup that is
##  output by the XML parser.  One could retain some of the markup and
##  create a better looking output.


##  Call syntax for this script:
##
##      docx2txt  yourfilename.docx
##
##  if you want to see the text in your terminal window, and
##
##      docx2txt  yourfilename.docx   >    outfile.txt
##
##  if you want to direct the output into another file.  In either 
##  case, make sure that you have made the script executable before 
##  you invoke it for text extraction.



##                           IMPORTANT NOTE
##
##
##  Before you can use the current script, you must first download TestREX.pl
##  from the link:
##
##     http://cobweb.ecn.purdue.edu/~kak/scriptingwithobjects/swocode/chap17/TestREX.pl
##
##  Place TestREX.pl in the directory from which you invoke this script;
##  the script looks for it in the current working directory.  [ By the
##  way, the URL shown above (up to and including 'chap17') also contains
##  TestREX.py, which you can use instead of TestREX.pl provided you
##  suitably modify the command that is run by the docx_to_parsed_xml()
##  function defined below. ]



##                              CREDITS
##
##
##  I first thought of writing this script when I saw a suggestion posted
##  at:
##
##      http://stackoverflow.com/questions/42482/best-way-to-extract-text-from-a-word-doc-without-using-com-automation
##
##  by Guy Starbuck on how to extract raw ASCII from the docx files; he
##  suggested extracting text from a docx file by putting it through an
##  XML parser.  That is exactly what I have tried to do in the current
##  script.  Since I did not want to spend more than an hour on this
##  project and since I had lying around the XML parsing scripts from my
##  book "Scripting With Objects", I decided to put those to service.
##  But note that the primary credit for the XML parsing that is used
##  here must go to Robert Cameron.  He is the creator of the REX XML
##  shallow parsing regular expression.  Both the Perl script TestREX.pl
##  and the Python script TestREX.py are based on Cameron's incredibly
##  beautiful (and deliciously complex) regular expression called REX.



##                              HELP
##
##
##  If you see any problems with this script or if you would like to see
##  this script extended, send a note to Avi Kak at kak@purdue.edu with
##  "docx2txt" in the subject line to get past my otherwise extremely
##  mean spam filter.


##                         CHANGE HISTORY
##
##
##  Damiano Ferrari brought to my attention the fact that the previous
##  version of this script had a problem with filenames with spaces in
##  them.  This version has fixed that problem.  But since it is not
##  uncommon for well-intentioned changes to have unforeseen
##  consequences, the previous version of this script is still available
##  as
##
##       docx2txt.v0
##
##  at the same location where you found docx2txt.  Thanks Damiano.




import os, sys, re, tempfile, subprocess

##  sys.argv[0] is the script itself, so everything after it is what the
##  user supplied.  Exactly one argument (the docx file) is accepted;
##  otherwise exit with the usage message.
supplied_argument_count = len(sys.argv) - 1
if supplied_argument_count != 1:
    sys.exit("Call syntax: script_name input_docx_filename")

##  NOTE: You must first download TestREX.pl from the link given above
##  for the function below to work.  
def docx_to_parsed_xml(cwd, filename, parser="TestREX.pl"):
    """Run the XML shallow parser on ``filename`` and return what it prints.

    Parameters:
        cwd:       directory that contains the parser script.
        filename:  path of the XML file handed to the parser as its only
                   command-line argument (resolved by the child process
                   relative to the current working directory).
        parser:    name of the parser script inside ``cwd``.  Defaults to
                   "TestREX.pl"; pass e.g. "TestREX.py" to use another one.

    Returns:
        Everything the parser wrote to its standard output, exactly as
        delivered by the subprocess module.

    Raises:
        OSError: if the parser writes anything at all to its standard
                 error stream, or if the parser cannot be started (the
                 script must exist and be directly executable).
    """
    ##  An argument list run without a shell keeps directory names and
    ##  filenames containing spaces or shell metacharacters intact.
    command = [os.path.join(cwd, parser), filename]
    p = subprocess.Popen(command,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    ##  communicate() closes the child's stdin, drains stdout and stderr
    ##  together and then waits for the child.  Reading one pipe to EOF
    ##  before touching the other can deadlock once the unread pipe fills.
    retval, erroroutput = p.communicate()
    if erroroutput:
        raise OSError("Command execution caused error: %s" \
                                    % erroroutput)
    return retval

##  Regular expressions used to turn the parser's dump into plain text.
##  ITEM_MARKUP_RE removes the "Item N:" label at the start of a line and,
##  when the remainder of that line is a markup tag, the tag as well.
##  (Presumably the parser prints one token per line in that form ---
##  confirm against the output of TestREX.)
ITEM_MARKUP_RE = re.compile(r'^Item \d+:[ ]+([^ ]*<[^<>]*>$)?', re.MULTILINE)
WHITESPACE_RE = re.compile(r'\s+', re.MULTILINE)
TOKEN_COUNT_RE = re.compile(r'Number of tokens extracted: \d*')
##  A sentence ending: a non-space character, one of . ? ! and one
##  whitespace character.  The capture group makes re.split() keep it.
SENTENCE_END_RE = re.compile(r'([^ ][.?!]\s)')


def strip_parser_markup(parsed):
    """Return the text left after removing the parser's labels and tags.

    The "Item N:" labels and tag-only lines are deleted, every run of
    whitespace is collapsed into one blank, and the "Number of tokens
    extracted: N" summary is removed.  The result is one long string
    with no linebreaks.
    """
    text = ITEM_MARKUP_RE.sub('', parsed)
    text = WHITESPACE_RE.sub(' ', text)
    return TOKEN_COUNT_RE.sub('', text)


def split_into_sentences(text):
    """Break ``text`` at sentence endings and return the list of pieces.

    Each piece keeps its terminating punctuation and the whitespace
    character that follows it.  Text after the last sentence ending is
    returned as a final piece (stripped of surrounding whitespace)
    unless it is blank, so a document that does not end with . ? or !
    does not lose its last line.
    """
    ##  With one capture group, re.split() returns an odd-length list
    ##  that alternates  body, ending, body, ending, ..., tail.  Pairing
    ##  by position, rather than re-testing each piece against a regex,
    ##  keeps a body that merely starts like an ending (say "3.5 is")
    ##  from being mistaken for one.
    parts = SENTENCE_END_RE.split(text)
    sentences = [body + ending
                 for body, ending in zip(parts[0::2], parts[1::2])]
    tail = parts[-1].strip()
    if tail:
        sentences.append(tail)
    return sentences


def main():
    """Extract the text of the docx file named in sys.argv[1] to stdout.

    The docx file is copied into a temporary directory and unpacked with
    the external "unzip" command, word/document.xml is run through
    docx_to_parsed_xml(), and the recovered text is written to standard
    output one sentence per line.

    Raises:
        IOError: if the filename does not end in ".docx".
        OSError: if the file cannot be copied, or if unzip or the XML
                 parser writes anything to its standard error stream.
    """
    filename = sys.argv[1]
    if not filename.endswith( ".docx" ):
        raise IOError("Only docx files are processed by this script")
    cwd = os.getcwd()
    ##  os.path.join() leaves an absolute filename untouched and
    ##  resolves a relative one against the directory we started in.
    source_path = os.path.join( cwd, filename )
    tempdir = tempfile.mkdtemp()
    try:
        os.chdir( tempdir )
        if subprocess.call( ["cp", source_path, "tempdocxfile"] ) != 0:
            raise OSError("Could not copy %s into the temporary directory" \
                                        % source_path)
        p = subprocess.Popen( ["unzip", "tempdocxfile"],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        ##  communicate() drains both pipes while waiting for unzip;
        ##  waiting first and reading afterwards can deadlock if unzip
        ##  fills a pipe before it exits.
        unused_output, erroroutput = p.communicate()
        if erroroutput:
            raise OSError("Executing the command caused an error: %s" \
                                        % erroroutput)
        os.chdir( "word" )
        parsed = docx_to_parsed_xml( cwd, "document.xml" )
    finally:
        ##  Always return to the starting directory and remove the
        ##  temporary directory, even when one of the steps above failed.
        os.chdir( cwd )
        subprocess.call( ["rm", "-rf", tempdir] )

    ##  Under Python 3 the subprocess pipes deliver bytes; the regular
    ##  expressions above need text.  (Assumes document.xml is encoded
    ##  as UTF-8 --- TODO confirm for unusual documents.)
    if not isinstance( parsed, str ):
        parsed = parsed.decode( "utf-8", "replace" )

    textdocstring = strip_parser_markup( parsed )

    ##  If you want to see the output as one long string with no
    ##  linebreaks, write textdocstring itself to sys.stdout instead of
    ##  running the loop below.  I personally find it more difficult to
    ##  read that kind of output.  So the text is broken at the sentence
    ##  endings --- sometimes with funny results:
    for sentence in split_into_sentences( textdocstring ):
        sys.stdout.write( sentence + "\n" )


if __name__ == "__main__":
    main()

