#!/usr/bin/env python

## Name of the script: docx2txt
## by Avi Kak (kak@purdue.edu) September 8, 2009

## Docx to Text Converter:
##
## This script produces an ASCII dump of a docx file. Note that the
## ASCII dump will not retain any of the format and style information
## of the original document. You are also forewarned this is just a
## poor man's text extractor that should give satisfactory results on
## most simple documents. It should be easy to extend the script to
## retain at least the paragraph level divisions in the original
## document. At this time, I simply discard all the markup that is
## output by the XML parser. One could retain some of the markup and
## create a better looking output.

## Call syntax for this script:
##
##     docx2txt yourfilename.docx
##
## if you want to see the text in your terminal window, and
##
##     docx2txt yourfilename.docx > outfile.txt
##
## if you want to direct the output into another file. In either
## case, make sure that you have made the script executable before
## you invoke it for text extraction.

## IMPORTANT NOTE ##
##
## Before you can use the current script, you must first download TestREX.pl
## from the link:
##
##     http://cobweb.ecn.purdue.edu/~kak/scriptingwithobjects/swocode/chap17/TestREX.pl
##
## Place TestREX.pl in the same directory in which you want to extract
## ASCII from the docx files. [ By the way, the URL shown above (up to
## and including 'chap17') also contains TestREX.py that you can use
## instead of TestREX.pl provided you suitably modify the second line
## of the docx_to_parsed_xml() function defined below. ]

## CREDITS ##
##
## I first thought of writing this script when I saw a suggestion posted
## at:
##
##     http://stackoverflow.com/questions/42482/best-way-to-extract-text-from-a-word-doc-without-using-com-automation
##
## by Guy Starbuck on how to extract raw ASCII from the docx files; he
## suggested extracting text from a docx file by putting it through an
## XML parser.
## That is exactly what I have tried to do in the current
## script. Since I did not want to spend more than an hour on this
## project and since I had lying around the XML parsing scripts from my
## book "Scripting With Objects", I decided to put those to service.
## But note that the primary credit for the XML parsing that is used
## here must go to Robert Cameron. He is the creator of the REX XML
## shallow parsing regular expression. Both the Perl script TestREX.pl
## and the Python script TestREX.py are based on Cameron's incredibly
## beautiful (and deliciously complex) regular expression called REX.

## HELP ##
##
## If you see any problems with this script or if you would like to see
## this script extended, send a note to Avi Kak at kak@purdue.edu with
## "docx2txt" in the subject line to get past my otherwise extremely
## mean spam filter.

## CHANGE HISTORY ##
##
## Damiano Ferrari brought to my attention the fact that the previous
## version of this script had a problem with filenames with spaces in
## them. This version has fixed that problem. But since it is not
## uncommon for well-intentioned changes to have unforeseen
## consequences, the previous version of this script is still available
## as
##
##     docx2txt.v0
##
## at the same location where you found docx2txt. Thanks Damiano.

import os
import re
import shutil
import subprocess
import sys
import tempfile
import zipfile

## Archive member that holds the body text of a docx file.
DOCUMENT_MEMBER = "word/document.xml"

## Strips the "Item N:" prefix the parser puts in front of every token and,
## when the token is nothing but a markup tag, the tag as well.
ITEM_MARKUP = re.compile(r'^Item \d+:[ ]+([^ ]*<[^<>]*>$)?', re.MULTILINE)
## Any run of whitespace (including the newlines left behind by ITEM_MARKUP).
WHITESPACE_RUN = re.compile(r'\s+', re.MULTILINE)
## A sentence ending: last character of the sentence, the punctuation mark,
## and the whitespace that follows.  The group makes re.split() keep it.
SENTENCE_END = re.compile(r'([^ ][.?!]\s)')
## Summary line that the parser prints after the tokens.
TOKEN_COUNT = re.compile(r'Number of tokens extracted: \d*')


def to_text(data):
    """Return `data` as a native string.

    Under Python 2 the pipes of a child process yield `str`, which is
    returned untouched.  Under Python 3 they yield `bytes`, which are
    decoded as UTF-8 (undecodable bytes are replaced, never raised on).
    """
    if isinstance(data, str):
        return data
    return data.decode("utf-8", "replace")


## NOTE: You must first download TestREX.pl from the link given above
## for the function below to work.
def docx_to_parsed_xml(cwd, filename):
    """Run the TestREX.pl found in `cwd` on `filename`; return its output.

    cwd      -- directory that contains the (executable) TestREX.pl
    filename -- XML file handed to the parser; a relative name is resolved
                against the current working directory of this process

    Returns everything the parser wrote to stdout, as a string.
    Raises OSError if the parser cannot be started or if it writes
    anything to stderr.
    """
    # To use TestREX.py instead, change the script name on the next line.
    parser = os.path.join(cwd, "TestREX.pl")
    # An argument list (no shell) keeps directory and file names that
    # contain spaces or shell metacharacters intact.
    child = subprocess.Popen([parser, filename],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    # communicate() closes stdin and drains both pipes concurrently, so
    # neither side can block on a full pipe buffer.
    retval, erroroutput = child.communicate()
    if erroroutput:
        raise OSError("Command execution caused error: %s"
                      % to_text(erroroutput))
    return to_text(retval)


def extract_document_xml(docx_path, destdir):
    """Unpack word/document.xml from the docx at `docx_path` into `destdir`.

    Only that one member is extracted; it ends up at
    `destdir`/word/document.xml.  Raises OSError if the file is not a zip
    archive or does not contain the member.
    """
    try:
        archive = zipfile.ZipFile(docx_path)
        try:
            archive.extract(DOCUMENT_MEMBER, destdir)
        finally:
            archive.close()
    except (zipfile.BadZipfile, KeyError) as err:
        raise OSError("Could not extract %s from %s: %s"
                      % (DOCUMENT_MEMBER, docx_path, err))


def strip_markup(parsed):
    """Drop the parser's item prefixes and markup tags from `parsed`.

    All whitespace runs are then collapsed to a single blank, so the
    result is one long line of text.
    """
    return WHITESPACE_RUN.sub(' ', ITEM_MARKUP.sub('', parsed))


def split_sentences(text):
    """Break `text` at the sentence endings and return the list of pieces.

    This is a crude split --- sometimes with funny results, e.g. after
    abbreviations.  Each sentence keeps its punctuation and the blank
    that followed it.  The parser's "Number of tokens extracted" summary
    is removed.  Text after the last sentence ending is returned too
    (stripped), so nothing is lost when the document does not end with a
    punctuation mark.
    """
    pieces = [TOKEN_COUNT.sub('', piece) for piece in SENTENCE_END.split(text)]
    # With one capturing group, re.split() returns text and sentence
    # endings strictly alternating: text, ending, text, ending, ..., text.
    # Pairing by position is what tells them apart; a text piece such as
    # "U." looks just like an ending when judged by its content.
    sentences = [pieces[i - 1] + pieces[i] for i in range(1, len(pieces), 2)]
    leftover = pieces[-1].strip()
    if leftover:
        sentences.append(leftover)
    return sentences


def main(argv=None):
    """Command line entry point: print the text of the docx named in argv.

    argv -- argument vector including the script name; defaults to
            sys.argv
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        sys.exit("Call syntax: script_name input_docx_filename")

    filename = argv[1]
    if not filename.endswith(".docx"):
        raise IOError("Only docx files are processed by this script")

    cwd = os.getcwd()
    # os.path.join() leaves an absolute filename untouched.
    docx_path = os.path.join(cwd, filename)
    tempdir = tempfile.mkdtemp()
    try:
        extract_document_xml(docx_path, tempdir)
        # The parser is run from inside the unpacked "word" directory.
        os.chdir(os.path.join(tempdir, "word"))
        textdocstring = docx_to_parsed_xml(cwd, "document.xml")
    finally:
        # Always go back and clean up, also when something went wrong.
        os.chdir(cwd)
        shutil.rmtree(tempdir, ignore_errors=True)

    textdocstring = strip_markup(textdocstring)

    ## If you want to see the output as one long string with no linebreaks,
    ## print textdocstring here and skip the loop below.  The loop is really
    ## not necessary if you want the ASCII to be dumped out as a single
    ## string with no linebreaks.  I personally find it more difficult to
    ## read that kind of output, so the text is broken at the sentence
    ## endings.
    for sentence in split_sentences(textdocstring):
        print(sentence)


if __name__ == "__main__":
    main()