0

我确信这个问题可以相对容易地解决,但我正在努力寻找问题。我的代码只是从文件中读取所有单词,然后将每个单词、单词位置、句子的开头和结尾存储在一个数组中。数组被输出到另一个文本文件。

我可以正常读取最后一句话之前的所有内容,但读到最后一句时就出错了。有什么想法吗?

/**
 *  Programmer: fryeguy
 *  Course: 
 *  Program: TxtCrawl for MicroSearch
 *
 *  Algorithm:
 *  TxtCrawl is the component of MicroSearch that reads text
 *  documents for search terms and stores them for
 *  indexing
 *
 *  1. Count words in doc, then initialize
 *     wordsFromDoc array to wordCount
 *  2. Initiate output file for writing.
 *  3. Open input file for reading words.
 *  4. Until reaching EOF:
 *     4.a. Set value for start "get pointer" in startSentence (.tellg()).
 *     4.b. Store value for end "get pointer" in endSentence (.tellg()).
 *     4.c. Reset "get pointer" to startSentence location.
 *     4.d. Until reaching endSentence, Read into the
 *          array theWord, wordPos, startSent, and endSent
 *  5. Write wordsFromDoc array to file
 *  6. When EOF is reached close the files.
 */

#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

/**
 * One indexed word together with the stream offsets that locate it.
 *
 * Every member has a default value, so a default-constructed record is
 * fully initialised (empty word, all offsets 0) instead of carrying
 * indeterminate integers.
 */
struct wordProps        // stores word info to be placed in array
{
    std::string theWord;        // stores the word
    int         wordPos   = 0;  // stores the position of word
    int         startSent = 0;  // stores the start point of the sentence
    int         endSent   = 0;  // stores the end point of the sentence
};

void countWords(string, int&, int&);

int main()
{

    ifstream iFile; // file stream for reading in data
    ofstream oFile; // file stream for writing data

    string  iFileName = "TextFile2.txt";    // name of test file to read from
    string  oFileName = "OutputFile.txt";   // name of test file to write to
    string  aLine = "";                     // stores a line preceeding a newline character (\n)
    string  aWord = "";                     // stores words from doc for indexing
    int     charCount = 0;                  // count of characters in doc
    int     wordCount = 0;                  // count of words in doc
    int     aLineWordCount = 0;             // count of words in a single line being processed
    int     wordBegin = 0;                  // stores location of word in doc
    int     startSentence = 0;              // stores pointer value for start of sentence
    int     endSentence = 0;                // stores pointer value for end of sentence

    /**
     * 1. Count words in doc, then initialize
     *    wordsFromDoc array to wordCount
     */
    countWords(iFileName, charCount, wordCount);
    cout << "charCount: " << charCount << endl; // DEBUG CODE
    cout << "wordCount: " << wordCount << endl; // DEBUG CODE
    wordProps wordsFromDoc[wordCount];
    cout<< "length of array: " << (sizeof(wordsFromDoc) / sizeof(*wordsFromDoc)) << endl;  // DEBUG CODE

    /**
     * 2. Initiate output file for writing
     */
    oFile.open (oFileName.c_str()); // setup output file and write header
    oFile << setw(20) << left << "File Name: " << iFileName << endl;
    oFile << setw(20) << "---------------------------------------" << endl << endl;

    /**
     * 3. Open input file for reading words
     */
    iFile.open (iFileName.c_str());
    if (!iFile.is_open())
        cout << "No such file exists!" << endl;
    else
    {
        /**
         * 4. Until reaching EOF:
         */
        // I have been attempting different counting methods assuming the eof was being reached prematurely
        // The results really have not varied with this code
        // while (iFile.tellg() != charCount) 
        while (!iFile.eof())
        {
            //cout << "count: " << count << endl;
            /**
             * 4.a. Set value for start "get pointer" in startSentence (.tellg()).
             */
            startSentence = iFile.tellg();
            cout << "startSentence: " << startSentence << endl; // DEBUG CODE

            /**
             * 4.b. Store value for end "get pointer" in endSentence (.tellg()).
             */
            getline(iFile, aLine, '.');
            cout << aLine << endl; // DEBUG CODE
            endSentence = iFile.tellg();
            aLine.clear();
            cout << "endSentence: " << endSentence << endl; // DEBUG CODE

            if (!iFile.is_open())
            {
                cout << "The if, iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE
                iFile.close();
                iFile.open (iFileName.c_str());
            }

            /**
             * 4.c. Reset "get pointer" to startSentence location.
             */
            iFile.seekg(startSentence);
            cout << "iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE

            /**
             * 4.d. Until reaching endSentence, Read into the
             *      array theWord, wordPos, startSent, and endSent
             */

             // As the last line is about to be read there is an error of some sort.
             // My guess is that somehow I exceed the end of the file but my startSentence
             // and endSentence variables are pointing where I think they should.

            for ( ; iFile.tellg() < endSentence; aLineWordCount++)
            {
                wordsFromDoc[aLineWordCount].wordPos = iFile.tellg();
                cout << "wordPos: " << wordsFromDoc[aLineWordCount].wordPos << endl; // DEBUG CODE
                iFile >> wordsFromDoc[aLineWordCount].theWord;
                cout << "theWord: " << wordsFromDoc[aLineWordCount].theWord << endl; // DEBUG CODE
                wordsFromDoc[aLineWordCount].startSent = startSentence;
                cout << "startSent: " << wordsFromDoc[aLineWordCount].startSent << endl; // DEBUG CODE
                wordsFromDoc[aLineWordCount].endSent = endSentence;
                cout << "endSent: " << wordsFromDoc[aLineWordCount].endSent << endl << endl; // DEBUG CODE
                cout << "aLineWordCount: " << aLineWordCount << endl;
            } // end for

        } // end while !=iFile.eof

            // THIS section of code is never reached because of the hang up above.
            /**
             * 5. Write wordsFromDoc array to file
             */
            for (int count = 0; count < aLineWordCount; count++)
            {
                oFile << setw(20) << left
                << wordsFromDoc[count].theWord << " "
                << wordsFromDoc[count].wordPos << " "
                << wordsFromDoc[count].startSent << " "
                << wordsFromDoc[count].endSent << endl;
            }

    } // end else

    /**
     * 6. When EOF is reached close the files.
     */
    iFile.close();
    oFile.close();

// DEBUG CDODE for verifying results
//  for (int count = 0; count < wordCount; count++) {
//      cout << "theWord: " << wordsFromDoc[count].theWord << endl;
//      cout << "wordPos: " << wordsFromDoc[count].wordPos << endl;
//      cout << "startSent: " << wordsFromDoc[count].startSent << endl;
//      cout << "endSent: " << wordsFromDoc[count].endSent << endl << endl;
//  }

}

/**
 * Implement countWords function
 */
/**
 * Counts the characters and the whitespace-separated words of a file.
 *
 * The file is opened once, read-only. Characters are counted with get(),
 * then the stream is rewound and words are counted with operator>>. Only
 * successful extractions are counted, so trailing whitespace does not add
 * a phantom word and an empty file yields zero words.
 *
 * @param theFileName  path of the file to inspect
 * @param charCount    incremented once per character read (not reset here)
 * @param wordCount    incremented once per word read (not reset here)
 *
 * If the file cannot be opened, "No such file exists!" is printed to
 * standard output and both counters are left untouched.
 */
void countWords(std::string theFileName, int &charCount, int &wordCount)
{
    std::ifstream inFile(theFileName.c_str());
    if (!inFile.is_open())
    {
        std::cout << "No such file exists!" << std::endl;
        return;
    }

    // count the chars
    char theChar = ' ';
    while (inFile.get(theChar))
        ++charCount;

    // The first pass ended by hitting EOF; clear the flags before rewinding.
    inFile.clear();
    inFile.seekg(0, std::ios::beg);

    // count the words
    std::string theWord;
    while (inFile >> theWord)
        ++wordCount;
}
4

1 回答 1

1

我查过了:istream 的 get 和 getline 都没有可以一次处理多个分隔符的版本(见注 1)。

其他人也遇到过同样的问题(见注 2)。逐字符 I/O 是最实用的解决方案;其他解决方案则需要自己编写现有 istream 方法的增强版本。

一个主意

  1. 一次将整个文件读入内存。
  2. 删除换行符(任何 CR 或 LF)。
  3. 在将文档写回磁盘时,在每个句号分隔符之后插入统一的标记(LF 或 ETX '\003'),从而把文档拆分成每行以一个句号结尾的若干行。
  4. 现在可以照常处理文件了;但使用已知标记而不是句点作为分隔符。
  5. 删除保存重新分隔文档的临时文件。

一次阅读整个文档不是问题,因为它最终都在内存中;将单词放在一起的字符串等于整个文档。一旦将重新定界的文档写入磁盘,就可以释放内存。

笔记

1 istream::get
2 带多个分隔符的 getline(Code Guru 上的讨论)

于 2010-11-21T05:36:53.053 回答