macro to generate bigrams in Writer - libreoffice

How do I generate bigrams using basic language?
I can do that in Python like this...
import nltk, sys
from nltk.tokenize import word_tokenize
sys.stdout = open("mygram1.txt", "w")
with open("mytext.txt") as f:
for text in f:
tokens = nltk.word_tokenize(text)
bigrm = (nltk.bigrams(tokens))
print(*map(' '.join, bigrm), sep='\n')
But I need a macro that I can run in Libreoffice writer. I do not want to use Python.
Update:
just like bigrams, nltk has trigrams method that I call using nltk.trigrams And if I need four or five grams there is everygrams!
from nltk import everygrams
import nltk, sys
from nltk.tokenize import word_tokenize
sys.stdout = open("myfourgram1.txt", "w")
with open("/home/ubuntu/mytext.txt") as f:
for text in f:
tokens = nltk.word_tokenize(text)
for i in list(everygrams(tokens, 4, 4)):
print((" ".join(i)))
Is it possible in libreoffice basic?

You could replicate the behaviour of your Python code by recycling the code in my answer to your previous question (Can you Print the wavy lines generated by Spell check in writer?). First strip out all the stuff relating to spell checking, generating alternatives and sorting, thereby making it considerably shorter, and change the line that inserts the results into the new document to make it just insert pairs of words. Rather than having your input text in a .txt file, you would have to put them into a writer document, and the results would appear in a new writer document.
It should look something like the listing below. This also includes the subsidiary function IsWordSeparator()
Option Explicit
Sub ListBigrams
Dim oSource As Object
oSource = ThisComponent
Dim oSourceCursor As Object
oSourceCursor = oSource.getText.createTextCursor()
oSourceCursor.gotoStart(False)
oSourceCursor.collapseToStart()
Dim oDestination As Object
oDestination = StarDesktop.loadComponentFromURL( "private:factory/swriter", "_blank", 0, Array() )
Dim oDestinationText as Object
oDestinationText = oDestination.getText()
Dim oDestinationCursor As Object
oDestinationCursor = oDestinationText.createTextCursor()
Dim s As String, sParagraph As String, sPreviousWord As String, sThisWord As String
Dim i as Long, j As Long, nWordStart As Long, nWordEnd As Long, nChar As Long
Dim bFirst as Boolean
sPreviousWord = ""
bFirst = true
Do
oSourceCursor.gotoEndOfParagraph(True)
sParagraph = oSourceCursor.getString() & " " 'It is necessary to add a space to the end of
'the string otherwise the last word of the paragraph is not recognised.
nWordStart = 1
nWordEnd = 1
For i = 1 to Len(sParagraph)
nChar = ASC(Mid(sParagraph, i, 1))
If IsWordSeparator(nChar) Then '1
If nWordEnd > nWordStart Then '2
sThisWord = Mid(sParagraph, nWordStart, nWordEnd - nWordStart)
If bFirst Then
bFirst = False
Else
oDestinationText.insertString(oDestinationCursor, sPreviousWord & " " & sThisWord & Chr(13), False)
EndIf
sPreviousWord = sThisWord
End If '2
nWordEnd = nWordEnd + 1
nWordStart = nWordEnd
Else
nWordEnd = nWordEnd + 1
End If '1
Next i
Loop While oSourceCursor.gotoNextParagraph(False)
End Sub
'----------------------------------------------------------------------------
' OOME Listing 360.
Function IsWordSeparator(iChar As Long) As Boolean
' Horizontal tab \t 9
' New line \n 10
' Carriage return \r 13
' Space 32
' Non-breaking space 160
Select Case iChar
Case 9, 10, 13, 32, 160
IsWordSeparator = True
Case Else
IsWordSeparator = False
End Select
End Function
Even if it would be easier to do it in Python, as Jim K suggested, the BASIC approach would make it easier to distribute the functionality to users, since they would not have to install Python and the NLTK library (which is not straightforward).

Related

How to split word files by the number of characters

Could you anybody help me how to split word file by character!
I can't find any way to split word files by the number of characters on the internet!
For example, to split a document into 500-character blocks:
Sub SplitDocument()
Application.ScreenUpdating = False
Dim Rng As Range, i As Long, j As Long
Const Char As Long = 500
With ActiveDocument
' Process each character block
For i = 1 To Int(.Characters.Count / Char)
j = j + 1
' Get the character block
Set Rng = .Range((i - 1) * Char, i * Char)
' Copy the character block
Rng.Copy
Rng.Collapse wdCollapseEnd
Call NewDoc(ActiveDocument, (i - 1) * Char + 1, j)
Next
If Rng.End < .Range.End Then
i = i + 1: j = j + 1
Rng.End = .Range.End
' Copy the range
Rng.Copy
Rng.Collapse wdCollapseEnd
Call NewDoc(ActiveDocument, (i - 1) * Char + 1, j)
End If
End With
Set Rng = Nothing
Application.ScreenUpdating = True
End Sub
Sub NewDoc(DocSrc As Document, i As Long, j As Long)
Dim DocTgt As Document, HdFt As HeaderFooter
' Create the output document
Set DocTgt = Documents.Add(Visible:=False)
With DocTgt
' Paste contents into the output document, preserving the formatting
.Range.PasteAndFormat (wdFormatOriginalFormatting)
' Replicate the headers & footers
For Each HdFt In DocSrc.Sections(DocSrc.Characters(i).Sections(1).Index).Headers
.Sections(1).Headers(HdFt.Index).Range.FormattedText = HdFt.Range.FormattedText
Next
For Each HdFt In DocSrc.Sections(DocSrc.Characters(i).Sections(1).Index).Footers
.Sections(1).Footers(HdFt.Index).Range.FormattedText = HdFt.Range.FormattedText
Next
' Save & close the output document
.SaveAs FileName:=Split(DocSrc.FullName, ".doc")(0) & "_" & j & ".docx", _
FileFormat:=wdFormatXMLDocument, AddToRecentFiles:=False
.Close SaveChanges:=False
End With
Set DocTgt = Nothing: Set DocSrc = Nothing
End Sub

Can you Print the wavy lines generated by Spell check in writer?

As per google group, this macro can be used to print mis-spelled words in MS office.
https://groups.google.com/g/microsoft.public.word.spelling.grammar/c/OiFYPkLAbeU
Is there similar option in libre-office writer?
The following Subroutine replicates what the code in the Google group does. It is more verbose than the MS version but that is to be expected with LibreOffice / OpenOffice. It only does the spellchecker lines and not the green grammar checker ones, which is also the case with the MS version in the Google group.
Sub UnderlineMisspelledWords
' From OOME Listing 315 Page 336
GlobalScope.BasicLibraries.loadLibrary( "Tools" )
Dim sLocale As String
sLocale = GetRegistryKeyContent("org.openoffice.Setup/L10N", FALSE).getByName("ooLocale")
' ooLocale appears to return a string that consists of the language and country
' seperated by a dash, e.g. en-GB
Dim nDash As Integer
nDash = InStr(sLocale, "-")
Dim aLocale As New com.sun.star.lang.Locale
aLocale.Language = Left(sLocale, nDash - 1)
aLocale.Country = Right(sLocale, Len(sLocale) -nDash )
Dim oSpeller As Variant
oSpeller = createUnoService("com.sun.star.linguistic2.SpellChecker")
Dim emptyArgs() as new com.sun.star.beans.PropertyValue
Dim oCursor As Object
oCursor = ThisComponent.getText.createTextCursor()
oCursor.gotoStart(False)
oCursor.collapseToStart()
Dim s as String, bTest As Boolean
Do
oCursor.gotoEndOfWord(True)
s = oCursor.getString()
bTest = oSpeller.isValid(s, aLocale, emptyArgs())
If Not bTest Then
With oCursor
.CharUnderlineHasColor = True
.CharUnderlineColor = RGB(255, 0,0)
.CharUnderline = com.sun.star.awt.FontUnderline.WAVE
' Possible alternatives include SMALLWAVE, DOUBLEWAVE and BOLDWAVE
End With
End If
Loop While oCursor.gotoNextWord(False)
End Sub
This will change the actual formatting of the font to have a red wavy underline, which will print out like any other formatting. If any of the misspelled words in the document already have some sort of underlining then that will be lost.
You will probably want to remove the underlining after you have printed it. The following Sub removes underlining only where its style exactly matches that of the line added by the first routine.
Sub RemoveUnderlining
Dim oCursor As Object
oCursor = ThisComponent.getText.createTextCursor()
oCursor.gotoStart(False)
oCursor.collapseToStart()
Dim s as String, bTest As Boolean
Do
oCursor.gotoEndOfWord(True)
Dim bTest1 As Boolean
bTest1 = False
If oCursor.CharUnderlineHasColor = True Then
bTest1 = True
End If
Dim bTest2 As Boolean
bTest2 = False
If oCursor.CharUnderlineColor = RGB(255, 0,0) Then
bTest2 = True
End If
Dim bTest3 As Boolean
bTest3 = False
If oCursor.CharUnderline = com.sun.star.awt.FontUnderline.WAVE Then
bTest3 = True
End If
If bTest1 And bTest2 And bTest3 Then
With oCursor
.CharUnderlineHasColor = False
.CharUnderline = com.sun.star.awt.FontUnderline.NONE
End With
End If
Loop While oCursor.gotoNextWord(False)
End Sub
This will not restore any original underlining that was replaced by red wavy ones. Other ways of removing the wavy lines that would restore these are:
Pressing undo (Ctrl Z) but you will need to do that once for every word in your document, which could be a bit of a pain.
Running the subroutine UnderlineMisspelledWords on a temporary copy of the document and then discarding it after printing.
I hope this is what you were looking for.
In response to your above comment, it is straightforward to modify the above subroutine to do that instead of drawing wavy lines. The code below opens a new Writer document and writes into it a list of the misspelled words together with the alternatives that the spellchecker suggests:
Sub ListMisSpelledWords
' From OOME Listing 315 Page 336
GlobalScope.BasicLibraries.loadLibrary( "Tools" )
Dim sLocale As String
sLocale = GetRegistryKeyContent("org.openoffice.Setup/L10N", FALSE).getByName("ooLocale")
' ooLocale appears to return a string that consists of the language and country
' seperated by a dash, e.g. en-GB
Dim nDash As Integer
nDash = InStr(sLocale, "-")
Dim aLocale As New com.sun.star.lang.Locale
aLocale.Language = Left(sLocale, nDash - 1)
aLocale.Country = Right(sLocale, Len(sLocale) -nDash )
Dim oSource As Object
oSource = ThisComponent
Dim oSourceCursor As Object
oSourceCursor = oSource.getText.createTextCursor()
oSourceCursor.gotoStart(False)
oSourceCursor.collapseToStart()
Dim oDestination As Object
oDestination = StarDesktop.loadComponentFromURL( "private:factory/swriter", "_blank", 0, Array() )
Dim oDestinationText as Object
oDestinationText = oDestination.getText()
Dim oDestinationCursor As Object
oDestinationCursor = oDestinationText.createTextCursor()
Dim oSpeller As Object
oSpeller = createUnoService("com.sun.star.linguistic2.SpellChecker")
Dim oSpellAlternatives As Object, emptyArgs() as new com.sun.star.beans.PropertyValue
Dim sMistake as String, oSpell As Object, sAlternatives() as String, bTest As Boolean, s As String, i as Integer
Do
oSourceCursor.gotoEndOfWord(True)
sMistake = oSourceCursor.getString()
bTest = oSpeller.isValid(sMistake, aLocale, emptyArgs())
If Not bTest Then
oSpell = oSpeller.spell(sMistake, aLocale, emptyArgs())
sAlternatives = oSpell.getAlternatives()
s = ""
for i = LBound(sAlternatives) To Ubound(sAlternatives) - 1
s = s & sAlternatives(i) & ", "
Next i
s = s & sAlternatives(Ubound(sAlternatives))
oDestinationText.insertString(oDestinationCursor, sMistake & ": " & s & Chr(13), False)
End If
Loop While oSourceCursor.gotoNextWord(False)
End Sub
I don't know about the dictionaries but, in answer to your previous comment, if you paste the following code below Loop While and above End Sub it will result in the text in the newly opened Writer document being sorted without duplicates. It's not very elegant but it works on the text I've tried it on.
oDestinationCursor.gotoStart(False)
oDestinationCursor.gotoEnd(True)
Dim oSortDescriptor As Object
oSortDescriptor = oDestinationCursor.createSortDescriptor()
oDestinationCursor.sort(oSortDescriptor)
Dim sParagraphToBeChecked As String
Dim sThisWord As String
sThisWord = ""
Dim sPreviousWord As String
sPreviousWord = ""
oDestinationCursor.gotoStart(False)
oDestinationCursor.collapseToStart()
Dim k As Integer
Do
oDestinationCursor.gotoEndOfParagraph(True)
sParagraphToBeChecked = oDestinationCursor.getString()
k = InStr(sParagraphToBeChecked, ":")
If k <> 0 Then
sThisWord = Left(sParagraphToBeChecked, k-1)
End If
If StrComp(sThisWord, sPreviousWord, 0) = 0 Then
oDestinationCursor.setString("")
End If
sPreviousWord = sThisWord
Loop While oDestinationCursor.gotoNextParagraph(False)
Dim oReplaceDescriptor As Object
oReplaceDescriptor = oDestination.createReplaceDescriptor()
oReplaceDescriptor.setPropertyValue("SearchRegularExpression", TRUE)
oReplaceDescriptor.setSearchString("^$")
oReplaceDescriptor.setReplaceString("")
oDestination.replaceAll(oReplaceDescriptor)
It seems I didn't spot that because the text I tested it on contained only words that were either correct or had more than zero alternatives. I managed to replicate the error by putting in a word consisting of random characters for which the spellchecker was unable to suggest any alternatives. If no alternatives are found the function .getAlternatives() returns an array of size -1 so the error can be avoided by testing for this condition before the array is used. Below is a modified version of the first Do loop in the subroutine with such a condition added. If you replace the existing loop with that it should eliminate the error.
Do
oSourceCursor.gotoEndOfWord(True)
sMistake = oSourceCursor.getString()
bTest = oSpeller.isValid(sMistake, aLocale, emptyArgs())
If Not bTest Then
oSpell = oSpeller.spell(sMistake, aLocale, emptyArgs())
sAlternatives = oSpell.getAlternatives()
s = ""
If Ubound(sAlternatives) >= 0 Then
for i = LBound(sAlternatives) To Ubound(sAlternatives) - 1
s = s & sAlternatives(i) & ", "
Next i
s = s & sAlternatives(Ubound(sAlternatives))
End If
oDestinationText.insertString(oDestinationCursor, sMistake & ": " & s & Chr(13), False)
End If
Loop While oSourceCursor.gotoNextWord(False)
On re-reading the whole subroutine I think it would improve its readability if the variable sMistake were renamed to something like sWordToBeChecked, as the string this variable contains isn't always misspelled. This would of course need to be changed everywhere in the routine and not just in the above snippet.
Below is a modified version that uses the dispatcher as suggested by Jim K in his answer go to end of word is not always followed. I have written it out in its entirety because the changes are more extensive than just adding or replacing a block. In particular, it is necessary to get the view cursor before creating the empty destination document, otherwise the routine will spell check that.
Sub ListMisSpelledWords2
' From OOME Listing 315 Page 336
GlobalScope.BasicLibraries.loadLibrary( "Tools" )
Dim sLocale As String
sLocale = GetRegistryKeyContent("org.openoffice.Setup/L10N", FALSE).getByName("ooLocale")
' ooLocale appears to return a string that consists of the language and country
' seperated by a dash, e.g. en-GB
Dim nDash As Integer
nDash = InStr(sLocale, "-")
Dim aLocale As New com.sun.star.lang.Locale
aLocale.Language = Left(sLocale, nDash - 1)
aLocale.Country = Right(sLocale, Len(sLocale) -nDash )
Dim oSourceDocument As Object
oSourceDocument = ThisComponent
Dim nWordCount as Integer
nWordCount = oSourceDocument.WordCount
Dim oFrame As Object, oViewCursor As Object
With oSourceDocument.getCurrentController
oFrame = .getFrame()
oViewCursor = .getViewCursor()
End With
Dim oDispatcher as Object
oDispatcher = createUnoService("com.sun.star.frame.DispatchHelper")
oDispatcher.executeDispatch(oFrame, ".uno:GoToStartOfDoc", "", 0, Array())
Dim oDestinationDocument As Object
oDestinationDocument = StarDesktop.loadComponentFromURL( "private:factory/swriter", "_blank", 0, Array() )
Dim oDestinationText as Object
oDestinationText = oDestinationDocument.getText()
Dim oDestinationCursor As Object
oDestinationCursor = oDestinationText.createTextCursor()
Dim oSpeller As Object
oSpeller = createUnoService("com.sun.star.linguistic2.SpellChecker")
Dim oSpellAlternatives As Object, emptyArgs() as new com.sun.star.beans.PropertyValue
Dim sMistake as String, oSpell As Object, sAlternatives() as String, bTest As Boolean, s As String, i as Integer
For i = 0 To nWordCount - 1
oDispatcher.executeDispatch(oFrame, ".uno:WordRightSel", "", 0, Array())
sWordToBeChecked = RTrim( oViewCursor.String )
bTest = oSpeller.isValid(sWordToBeChecked, aLocale, emptyArgs())
If Not bTest Then
oSpell = oSpeller.spell(sWordToBeChecked, aLocale, emptyArgs())
sAlternatives = oSpell.getAlternatives()
s = ""
If Ubound(sAlternatives) >= 0 Then
for i = LBound(sAlternatives) To Ubound(sAlternatives) - 1
s = s & sAlternatives(i) & ", "
Next i
s = s & sAlternatives(Ubound(sAlternatives))
End If
oDestinationText.insertString(oDestinationCursor, sWordToBeChecked & ": " & s & Chr(13), False)
End If
oDispatcher.executeDispatch(oFrame, ".uno:GoToPrevWord", "", 0, Array())
oDispatcher.executeDispatch(oFrame, ".uno:GoToNextWord", "", 0, Array())
Next i
oDestinationCursor.gotoStart(False)
oDestinationCursor.gotoEnd(True)
' Sort the paragraphs
Dim oSortDescriptor As Object
oSortDescriptor = oDestinationCursor.createSortDescriptor()
oDestinationCursor.sort(oSortDescriptor)
' Remove duplicates
Dim sParagraphToBeChecked As String, sThisWord As String, sPreviousWord As String
sThisWord = ""
sPreviousWord = ""
oDestinationCursor.gotoStart(False)
oDestinationCursor.collapseToStart()
Dim k As Integer
Do
oDestinationCursor.gotoEndOfParagraph(True)
sParagraphToBeChecked = oDestinationCursor.getString()
k = InStr(sParagraphToBeChecked, ":")
If k <> 0 Then
sThisWord = Left(sParagraphToBeChecked, k-1)
End If
If StrComp(sThisWord, sPreviousWord, 0) = 0 Then
oDestinationCursor.setString("")
End If
sPreviousWord = sThisWord
Loop While oDestinationCursor.gotoNextParagraph(False)
' Remove empty paragraphs
Dim oReplaceDescriptor As Object
oReplaceDescriptor = oDestinationDocument.createReplaceDescriptor()
oReplaceDescriptor.setPropertyValue("SearchRegularExpression", TRUE)
oReplaceDescriptor.setSearchString("^$")
oReplaceDescriptor.setReplaceString("")
oDestinationDocument.replaceAll(oReplaceDescriptor)
End Sub
It looks like the problem is caused by one of the while loops in the function TrimWord, which removes punctuation from before and after a word before it is fed to the spell-check service. If the word is only one character long and if it is a valid punctuation character then the condition at the beginning of the loop is true, so the loop is entered and the counter n is decremented to zero. Then at the beginning of the next traversal of the loop, even though the condition is false anyway, it still asks the Mid function to return the 0th character of the word, which it can't do because the characters are numbered from 1, so it throws an error. Some languages would ignore the error if the truth value of the condition could be unambiguously determined from the other parts of the expression. It looks like BASIC doesn't do that.
The following modified version of the function gets round the problem in a rather inelegant way, but it seems to work:
Function TrimWord(sWord As String) As String
Dim n as Long
n = Len(sWord)
If n > 0 Then
Dim m as Long : m = 1
Dim bTest As Boolean
bTest = m <= n
Do While IsPermissiblePrefix( ASC(Mid(sWord, m, 1) ) ) And bTest
if (m < n) Then
m = m + 1
Else
bTest = False
End If
Loop
bTest = n > 0
Do While IsPermissibleSuffix( ASC(Mid(sWord, n, 1) ) ) And bTest
if (n > 1) Then
n = n - 1
Else
bTest = False
End If
Loop
If n > m Then
TrimWord = Mid(sWord, m, (n + 1) - m)
Else
TrimWord = sWord
End If
Else
TrimWord = ""
End If
End Function
This works for me.
Firstly, in response to your question about the bug, I'm not a maintainer so I can't fix that. However, as the bug concerns moving a text cursor to the start and end of a word it should be possible to get round it by searching for the white-space between words instead. Since the white-space characters are (I think) the same in all languages, any problems recognising certain characters from certain alphabets shouldn't matter. The easiest way to do it would be to first read the entire text of the document into a string but LibreOffice strings have a maximum length of 2^16 = 65536 characters and while this seems like a lot it could easily be too small for a reasonable sized document. The limit can be avoided by navigating through the text one paragraph at a time. According to Andrew Pitonyak (OOME Page 388): "I found gotoNextSentence() and gotoNextWord() to be unreliable, but the paragraph cursor worked well."
The code below is yet another modification of the subroutines in previous answers. This time it gets a string from a paragraph and splits it up into words by finding the white-space between the words. It then spell checks the words as before. The subroutine depends on some other functions that are listed below it. These allow you to specify which characters to designate as word separators (i.e. white-space) and which characters to ignore if they are found at the beginning or end of a word. This is necessary so that, for example, the quotes surrounding a quoted word are not counted as part of the word, which would lead to it being recognised as a spelling mistake even if the word inside the quotes is correctly spelled.
I am not familiar with non-latin alphabets and I don't have an appropriate dictionary installed, but I pasted the words from your question go to end of word is not always followed, namely testी, भारत and इंडिया and they all appeared unmodified in the output document.
On the question of looking up synonyms, as each misspelled word has multiple suggestions, and each of those will have multiple synonyms, the output could rapidly become very large and confusing. It may be better for your user to look them up individually if they want to use a different word.
Sub ListMisSpelledWords3
' From OOME Listing 315 Page 336
GlobalScope.BasicLibraries.loadLibrary( "Tools" )
Dim sLocale As String
sLocale = GetRegistryKeyContent("org.openoffice.Setup/L10N", FALSE).getByName("ooLocale")
' ooLocale appears to return a string that consists of the language and country
' seperated by a dash, e.g. en-GB
Dim nDash As Integer
nDash = InStr(sLocale, "-")
Dim aLocale As New com.sun.star.lang.Locale
aLocale.Language = Left( sLocale, nDash - 1)
aLocale.Country = Right( sLocale, Len(sLocale) - nDash )
Dim oSource As Object
oSource = ThisComponent
Dim oSourceCursor As Object
oSourceCursor = oSource.getText.createTextCursor()
oSourceCursor.gotoStart(False)
oSourceCursor.collapseToStart()
Dim oDestination As Object
oDestination = StarDesktop.loadComponentFromURL( "private:factory/swriter", "_blank", 0, Array() )
Dim oDestinationText as Object
oDestinationText = oDestination.getText()
Dim oDestinationCursor As Object
oDestinationCursor = oDestinationText.createTextCursor()
Dim oSpeller As Object
oSpeller = createUnoService("com.sun.star.linguistic2.SpellChecker")
Dim oSpellAlternatives As Object, emptyArgs() as new com.sun.star.beans.PropertyValue
Dim sWordToCheck as String, oSpell As Object, sAlternatives() as String, bTest As Boolean
Dim s As String, i as Integer, j As Integer, sParagraph As String, nWordStart As Integer, nWordEnd As Integer
Dim nChar As Integer
Do
oSourceCursor.gotoEndOfParagraph(True)
sParagraph = oSourceCursor.getString() & " " 'It is necessary to add a space to the end of
'the string otherwise the last word of the paragraph is not recognised.
nWordStart = 1
nWordEnd = 1
For i = 1 to Len(sParagraph)
nChar = ASC(Mid(sParagraph, i, 1))
If IsWordSeparator(nChar) Then '1
If nWordEnd > nWordStart Then '2
sWordToCheck = TrimWord( Mid(sParagraph, nWordStart, nWordEnd - nWordStart) )
bTest = oSpeller.isValid(sWordToCheck, aLocale, emptyArgs())
If Not bTest Then '3
oSpell = oSpeller.spell(sWordToCheck, aLocale, emptyArgs())
sAlternatives = oSpell.getAlternatives()
s = ""
If Ubound(sAlternatives) >= 0 Then '4
for j = LBound(sAlternatives) To Ubound(sAlternatives) - 1
s = s & sAlternatives(j) & ", "
Next j
s = s & sAlternatives(Ubound(sAlternatives))
End If '4
oDestinationText.insertString(oDestinationCursor, sWordToCheck & " : " & s & Chr(13), False)
End If '3
End If '2
nWordEnd = nWordEnd + 1
nWordStart = nWordEnd
Else
nWordEnd = nWordEnd + 1
End If '1
Next i
Loop While oSourceCursor.gotoNextParagraph(False)
oDestinationCursor.gotoStart(False)
oDestinationCursor.gotoEnd(True)
Dim oSortDescriptor As Object
oSortDescriptor = oDestinationCursor.createSortDescriptor()
oDestinationCursor.sort(oSortDescriptor)
Dim sParagraphToBeChecked As String
Dim sThisWord As String
sThisWord = ""
Dim sPreviousWord As String
sPreviousWord = ""
oDestinationCursor.gotoStart(False)
oDestinationCursor.collapseToStart()
Dim k As Integer
Do
oDestinationCursor.gotoEndOfParagraph(True)
sParagraphToBeChecked = oDestinationCursor.getString()
k = InStr(sParagraphToBeChecked, ":")
If k <> 0 Then
sThisWord = Left(sParagraphToBeChecked, k-1)
End If
If StrComp(sThisWord, sPreviousWord, 0) = 0 Then
oDestinationCursor.setString("")
End If
sPreviousWord = sThisWord
Loop While oDestinationCursor.gotoNextParagraph(False)
Dim oReplaceDescriptor As Object
oReplaceDescriptor = oDestination.createReplaceDescriptor()
oReplaceDescriptor.setPropertyValue("SearchRegularExpression", TRUE)
oReplaceDescriptor.setSearchString("^$")
oReplaceDescriptor.setReplaceString("")
oDestination.replaceAll(oReplaceDescriptor)
End Sub
'----------------------------------------------------------------------------
' From OOME Listing 360.
Function IsWordSeparator(iChar As Integer) As Boolean
' Horizontal tab \t 9
' New line \n 10
' Carriage return \r 13
' Space 32
' Non-breaking space 160
Select Case iChar
Case 9, 10, 13, 32, 160
IsWordSeparator = True
Case Else
IsWordSeparator = False
End Select
End Function
'-------------------------------------
' Characters to be trimmed off beginning of word before spell checking
Function IsPermissiblePrefix(iChar As Integer) As Boolean
' Symmetric double quote " 34
' Left parenthesis ( 40
' Left square bracket [ 91
' Back-tick ` 96
' Left curly bracket { 123
' Left double angle quotation marks « 171
' Left single quotation mark ‘ 8216
' Left single reversed 9 quotation mark ‛ 8219
' Left double quotation mark “ 8220
' Left double reversed 9 quotation mark ‟ 8223
Select Case iChar
Case 34, 40, 91, 96, 123, 171, 8216, 8219, 8220, 8223
IsPermissiblePrefix = True
Case Else
IsPermissiblePrefix = False
End Select
End Function
'-------------------------------------
' Characters to be trimmed off end of word before spell checking
Function IsPermissibleSuffix(iChar As Integer) As Boolean
' Exclamation mark ! 33
' Symmetric double quote " 34
' Apostrophe ' 39
' Right parenthesis ) 41
' Comma , 44
' Full stop . 46
' Colon : 58
' Semicolon ; 59
' Question mark ? 63
' Right square bracket ] 93
' Right curly bracket } 125
' Right double angle quotation marks » 187
' Right single quotation mark ‘ 8217
' Right double quotation mark “ 8221
Select Case iChar
Case 33, 34, 39, 41, 44, 46, 58, 59, 63, 93, 125, 187, 8217, 8221
IsPermissibleSuffix = True
Case Else
IsPermissibleSuffix = False
End Select
End Function
'-------------------------------------
Function TrimWord( sWord As String) As String
Dim n as Integer
n = Len(sWord)
If n > 0 Then
Dim m as Integer : m = 1
Do While IsPermissiblePrefix( ASC(Mid(sWord, m, 1) ) ) And m <= n
m = m + 1
Loop
Do While IsPermissibleSuffix( ASC(Mid(sWord, n, 1) ) ) And n >= 1
n = n - 1
Loop
If n > m Then
TrimWord = Mid(sWord, m, (n + 1) - m)
Else
TrimWord = sWord
End If
Else
TrimWord = ""
End If
End Function

VB6 Hashing SHA1 output not matched

need help for my problem here. i do searching and googling for this problem but still don't found the solution why my output didnt matched with the expected output.
data to hash :
0800210142216688003333311100000554478000000
expected output :
DAAC526D4806C88CEDB8B7C6EA42A7442DE6E7DC
my output :
805C790E6BF39E3482067C44909EE126F9CBB878
and i am using this function to generate the hash
Public Function HashString(ByVal Str As String, Optional ByVal Algorithm As HashAlgorithm = SHA1) As String
On Error Resume Next
Dim hCtx As Long
Dim hHash As Long
Dim lRes As Long
Dim lLen As Long
Dim lIdx As Long
Dim AbData() As Byte
lRes = CryptAcquireContext(hCtx, vbNullString, vbNullString, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)
If lRes <> 0 Then
lRes = CryptCreateHash(hCtx, Algorithm, 0, 0, hHash)
If lRes <> 0 Then
lRes = CryptHashData(hHash, ByVal Str, Len(Str), 0)
If lRes <> 0 Then
lRes = CryptGetHashParam(hHash, HP_HASHSIZE, lLen, 4, 0)
If lRes <> 0 Then
ReDim AbData(0 To lLen - 1)
lRes = CryptGetHashParam(hHash, HP_HASHVAL, AbData(0), lLen, 0)
If lRes <> 0 Then
For lIdx = 0 To UBound(AbData)
HashString = HashString & Right$("0" & Hex$(AbData(lIdx)), 2)
Next
End If
End If
End If
CryptDestroyHash hHash
End If
End If
CryptReleaseContext hCtx, 0
If lRes = 0 Then
MsgBox Err.LastDllError
End If
End Function
and this is command to call the function
Dim received As String
Dim HASH As String
HASH = "0800210142216688003333311100000554478000000"
received = HashString(HASH)
Debug.Print ("HASH VALUE : " & received)
thanks
UPDATE:
finally i managed to get the expected output. i change the function to generate the sha1 using the sha1 function in this website :
http://vb.wikia.com/wiki/SHA-1.bas
and i do use this function to convert my hexstring to byte array
Public Function HexStringToByteArray(ByRef HexString As String) As Byte()
Dim bytOut() As Byte, bytHigh As Byte, bytLow As Byte, lngA As Long
If LenB(HexString) Then
' preserve memory for output buffer
ReDim bytOut(Len(HexString) \ 2 - 1)
' jump by every two characters (in this case we happen to use byte positions for greater speed)
For lngA = 1 To LenB(HexString) Step 4
' get the character value and decrease by 48
bytHigh = AscW(MidB$(HexString, lngA, 2)) - 48
bytLow = AscW(MidB$(HexString, lngA + 2, 2)) - 48
' move old A - F values down even more
If bytHigh > 9 Then bytHigh = bytHigh - 7
If bytLow > 9 Then bytLow = bytLow - 7
' I guess the C equivalent of this could be like: *bytOut[++i] = (bytHigh << 8) || bytLow
bytOut(lngA \ 4) = (bytHigh * &H10) Or bytLow
Next lngA
' return the output
HexStringToByteArray = bytOut
End If
End Function
and i using this command to get the expected output
Dim received As String
Dim HASH As String
Dim intVal As Integer
Dim temp() As Byte
HASH = "08002101422166880033333111000005544780000000"
temp = HexStringToByteArray(HASH)
received = Replace(HexDefaultSHA1(temp), " ", "")
Debug.Print ("HASH VALUE : " & received)
and finally i got the same output as expected. Yeah!!..
805c... is the SHA1 hash of the characters in your input string, i.e. '0', '8', '0', '0', ...
daac... is the SHA1 hash of the characters in your input string after conversion of each pair of hexadecimal digits to a byte, i.e. 0x08, 0x00, ...
Convert the input string to an array of bytes prior to hashing.
Your output is correct. This is SHA1 using python:
>>> import hashlib
>>> s = hashlib.sha1('0800210142216688003333311100000554478000000')
>>> s.hexdigest()
'805c790e6bf39e3482067c44909ee126f9cbb878'
Where did you get the other SHA1 computation from?

Encoding of Text Files in VB 6.0

I have huge external files with the "ANSI" and "UCS-2 Little Endian" encoding formats.
Now I want to change the file encoding format into UTF-8 using Visual Basic 6.0. I don't want to modify the original file; I just want to change the encoding format alone.
I have searched on the web; but can't understand the VB code, and have no idea how to do it.
I would also like to be able to read lines one at a time from UTF-8 encoded files.
NOTE. This answer has been extensively expanded to fit in with the edited question, which in turn was due to Visual Basic 6 and UTF-8
The following code wraps up converting ANSI, UTF-16 and UTF-32 encoded strings from a file to UTF-8 strings, in VB6. You have to load in the entire file and output it. Note that if it was truly generic, the LineInputUTF8() method would be LineInput(), and require a code page.
Option Explicit
Private Declare Function MultiByteToWideChar Lib "Kernel32.dll" ( _
ByVal CodePage As Long, _
ByVal dwFlags As Long, _
ByVal lpMultiByteStr As Long, _
ByVal cbMultiByte As Long, _
ByVal lpWideCharStr As Long, _
ByVal cchWideChar As Long _
) As Long
Private Declare Function WideCharToMultiByte Lib "Kernel32.dll" ( _
ByVal CodePage As Long, _
ByVal dwFlags As Long, _
ByVal lpWideCharStr As Long, _
ByVal cchWideChar As Long, _
ByVal lpMultiByteStr As Long, _
ByVal cbMultiByte As Long, _
ByVal lpDefaultChar As Long, _
ByVal lpUsedDefaultChar As Long _
) As Long
Public Const CP_ACP As Long = 0 ' Default ANSI code page.
Public Const CP_UTF8 As Long = 65001 ' UTF8.
Public Const CP_UTF16_LE As Long = 1200 ' UTF16 - little endian.
Public Const CP_UTF16_BE As Long = 1201 ' UTF16 - big endian.
Public Const CP_UTF32_LE As Long = 12000 ' UTF32 - little endian.
Public Const CP_UTF32_BE As Long = 12001 ' UTF32 - big endian.
' Purpose: Heuristic to determine whether bytes in a file are UTF-8.
Private Function FileBytesAreUTF8(ByVal the_iFileNo As Integer) As Boolean
Const knSampleByteSize As Long = 2048
Dim nLof As Long
Dim nByteCount As Long
Dim nByteIndex As Long
Dim nCharExtraByteCount As Long
Dim bytValue As Byte
' We look at the first <knSampleByteSize> bytes of the file. However, if the file is smaller, we will have to
' use the smaller size.
nLof = LOF(the_iFileNo)
If nLof < knSampleByteSize Then
nByteCount = nLof
Else
nByteCount = knSampleByteSize
End If
' Go to the start of the file.
Seek #the_iFileNo, 1
For nByteIndex = 1 To nByteCount
Get #the_iFileNo, , bytValue
' If the character we are processing has bytes beyond 1, then we are onto the next character.
If nCharExtraByteCount = 0 Then
'
' The UTF-8 specification says that the first byte of a character has masking bits which indicate how many bytes follow.
'
' See: http://en.wikipedia.org/wiki/UTF-8#Description
'
' Bytes in
' sequence Byte 1 Byte 2 Byte 3 Byte 4
' 1 0xxxxxxx
' 2 110xxxxx 10xxxxxx
' 3 1110xxxx 10xxxxxx 10xxxxxx
' 4 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
'
If (bytValue And &H80) = &H0 Then
nCharExtraByteCount = 0
ElseIf (bytValue And &HE0) = &HC0 Then
nCharExtraByteCount = 1
ElseIf (bytValue And &HF0) = &HE0 Then
nCharExtraByteCount = 2
ElseIf (bytValue And &HF8) = &HF0 Then
nCharExtraByteCount = 3
Else
' If none of these masks were matched, then this can't be a UTF-8 character.
FileBytesAreUTF8 = False
Exit Function
End If
Else
' All following bytes must be masked as in the table above.
If (bytValue And &HC0) = &H80 Then
nCharExtraByteCount = nCharExtraByteCount - 1
If nCharExtraByteCount = 0 Then
FileBytesAreUTF8 = True
End If
Else
' Not a UTF8 character.
FileBytesAreUTF8 = False
Exit Function
End If
End If
Next nByteIndex
End Function
' Purpose: Take a string whose bytes are in the byte array <the_abytCPString>, with code page <the_nCodePage>, convert to a VB string.
Private Function FromCPString(ByRef the_abytCPString() As Byte, ByVal the_nCodePage As Long) As String
Dim sOutput As String
Dim nValueLen As Long
Dim nOutputCharLen As Long
' If the code page says this is already compatible with the VB string, then just copy it into the string. No messing.
If the_nCodePage = CP_UTF16_LE Then
FromCPString = the_abytCPString()
Else
' Cache the input length.
nValueLen = UBound(the_abytCPString) - LBound(the_abytCPString) + 1
' See how big the output buffer will be.
nOutputCharLen = MultiByteToWideChar(the_nCodePage, 0&, VarPtr(the_abytCPString(LBound(the_abytCPString))), nValueLen, 0&, 0&)
' Resize output byte array to the size of the UTF-8 string.
sOutput = Space$(nOutputCharLen)
' Make this API call again, this time giving a pointer to the output byte array.
MultiByteToWideChar the_nCodePage, 0&, VarPtr(the_abytCPString(LBound(the_abytCPString))), nValueLen, StrPtr(sOutput), nOutputCharLen
' Return the array.
FromCPString = sOutput
End If
End Function
Public Function GetContents(ByVal the_sTextFile As String, ByRef out_nCodePage As Long, Optional ByVal the_nDesiredCodePage As Long = -1, Optional ByRef out_bContainedBOM As Boolean) As String
Dim iFileNo As Integer
Dim abytFileContents() As Byte
Dim nDataSize As Long
iFileNo = FreeFile
OpenForInput the_sTextFile, iFileNo, out_nCodePage, the_nDesiredCodePage, out_bContainedBOM
' We want to read the entire contents of the file (not including any BOM value).
' After calling OpenForInput(), the file pointer should be positioned after any BOM.
' So size file contents buffer to <file size> - <current position> + 1.
nDataSize = LOF(iFileNo) - Seek(iFileNo) + 1
ReDim abytFileContents(1 To nDataSize)
Get #iFileNo, , abytFileContents()
Close iFileNo
' Now we must convert this to UTF-8. But we have to first convert to the Windows NT standard UTF-16 LE.
GetContents = FromCPString(abytFileContents(), out_nCodePage)
End Function
' Purpose: Reads up to the end of the current line of the file, repositions to the beginning of the next line, if any, and
' outputs all characters found.
' Inputs: the_nFileNo The number of the file.
' Outputs: out_sLine The line from the current position in the file.
' Return: True if there is more data.
Public Function LineInputUTF8(ByVal the_nFileNo As Integer, ByRef out_sLine As String) As Boolean
Dim bytValue As Byte
Dim abytLine() As Byte
Dim nStartOfLinePos As Long
Dim nEndOfLinePos As Long
Dim nStartOfNextLine As Long
Dim nLineLen As Long
' Save the current file position as the beginning of the line, and cache this value.
nStartOfLinePos = Seek(the_nFileNo)
' Retrieves the first byte from the current position.
Get #the_nFileNo, , bytValue
' Loop until the end of file is encountered.
Do Until EOF(the_nFileNo)
' Check whether this byte represents a carriage return or line feed character (indicating new line).
If bytValue = 13 Or bytValue = 10 Then
' By this point, the current position is *after* the CR or LF character, so to get the position of the
' last byte in the line, we must go back two bytes.
nEndOfLinePos = Seek(the_nFileNo) - 2
' If this is a carriage return, then we must check the next character.
If bytValue = 13 Then
Get #the_nFileNo, , bytValue
' Is this a line feed?
If bytValue = 10 Then
' Yes. Assume that CR-LF counts as a single NewLine. So the start of the next line should skip over the line feed.
nStartOfNextLine = nEndOfLinePos + 3
Else
' No. The start of the next line is the current position.
nStartOfNextLine = nEndOfLinePos + 2
End If
ElseIf bytValue = 10 Then
' If this is a line feed, then the start of the next line is the current position.
nStartOfNextLine = nEndOfLinePos + 2
End If
' Since we have processed all the bytes in the line, exit the loop.
Exit Do
End If
' Get the next byte.
Get #the_nFileNo, , bytValue
Loop
' Check to see if there was an end of line.
If nEndOfLinePos = 0 Then
' No, this is the end of the file - so use all the remaining characters.
nLineLen = Seek(the_nFileNo) - nStartOfLinePos - 1
Else
' Yes - so use all the characters up to the end of line position.
nLineLen = nEndOfLinePos - nStartOfLinePos + 1
End If
' Is this line empty?
If nLineLen = 0 Then
' Yes - just return an empty string.
out_sLine = vbNullString
Else
' No - pull all the bytes from the beginning to the end of the line into a byte array, and then convert that from UTF-8 to a VB string.
ReDim abytLine(1 To nLineLen)
Get #the_nFileNo, nStartOfLinePos, abytLine()
out_sLine = FromCPString(abytLine(), CP_UTF8)
End If
' If there is a line afterwards, then move to the beginning of the line, and return True.
If nStartOfNextLine > 0 Then
Seek #the_nFileNo, nStartOfNextLine
LineInputUTF8 = True
End If
End Function
' Purpose: Analogue of 'Open "fileName" For Input As #fileNo' - but also return what type of text this is via a Code Page value.
' Inputs: the_sFileName
' the_iFileNo
' (the_nDesiredCodePage) The code page that you want to use with this file.
' If this value is set to the default, -1, this indicates that the code page will be ascertained from the file.
' Outputs: out_nCodePage There are only six valid values that are returned if <the_nDesiredCodePage> was set to -1.
' CP_ACP ANSI code page
' CP_UTF8 UTF-8
' CP_UTF16LE UTF-16 Little Endian (VB and NT default string encoding)
' CP_UTF16BE UTF-16 Big Endian
' CP_UTF32LE UTF-32 Little Endian
' CP_UTF32BE UTF-32 Big Endian
' (out_bContainedBOM) If this was set to True, then the file started with a BOM (Byte Order Marker).
Public Sub OpenForInput(ByRef the_sFilename As String, ByVal the_iFileNo As Integer, ByRef out_nCodePage As Long, Optional ByVal the_nDesiredCodePage As Long = -1, Optional ByRef out_bContainedBOM As Boolean)
' Note if we want to take account of every case, we should read in the first four bytes, and check for UTF-32 low and high endian BOMs, check
' the first three bytes for the UTF-8 BOM, and finally check the first two bytes for UTF-16 low and hight endian BOMs.
Dim abytBOM(1 To 4) As Byte
Dim nCodePage As Long
' By default, there is no BOM.
out_bContainedBOM = False
Open the_sFilename For Binary Access Read As #the_iFileNo
' We are interested in -1 (ascertain code page), and then various UTF encodings.
Select Case the_nDesiredCodePage
Case -1, CP_UTF8, CP_UTF16_BE, CP_UTF16_LE, CP_UTF32_BE, CP_UTF32_LE
' Default code page.
nCodePage = CP_ACP
' Pull in the first four bytes to determine the BOM (byte order marker).
Get #the_iFileNo, , abytBOM()
' The following are the BOMs for text files:
'
' FF FE UTF-16, little endian
' FE FF UTF-16, big endian
' EF BB BF UTF-8
' FF FE 00 00 UTF-32, little endian
' 00 00 FE FF UTF-32, big-endian
'
' Work out the code page from this information.
Select Case abytBOM(1)
Case &HFF
If abytBOM(2) = &HFE Then
If abytBOM(3) = 0 And abytBOM(4) = 0 Then
nCodePage = CP_UTF32_LE
Else
nCodePage = CP_UTF16_LE
End If
End If
Case &HFE
If abytBOM(2) = &HFF Then
nCodePage = CP_UTF16_BE
End If
Case &HEF
If abytBOM(2) = &HBB And abytBOM(3) = &HBF Then
nCodePage = CP_UTF8
End If
Case &H0
If abytBOM(2) = &H0 And abytBOM(3) = &HFE And abytBOM(4) = &HFF Then
nCodePage = CP_UTF32_BE
End If
End Select
' Did we match any BOMs?
If nCodePage = CP_ACP Then
' No - we are still defaulting to the ANSI code page.
' Special check for UTF-8. The BOM is not specified in the standard for UTF-8, but according to Wikipedia (which is always right :-) ),
' only Microsoft includes this marker at the beginning of files.
If FileBytesAreUTF8(the_iFileNo) Then
out_nCodePage = CP_UTF8
Else
out_nCodePage = CP_ACP
End If
Else
' Yes - we have worked out the code page from the BOM.
' If no code page was suggested, we now return the code page we found.
If the_nDesiredCodePage = -1 Then
out_nCodePage = nCodePage
End If
' Inform the caller that a BOM was found.
out_bContainedBOM = True
End If
' Reset the file pointer to the beginning of the file data.
If out_bContainedBOM Then
' Note that if the code page found was one of the two UTF-32 values, then we are already in the correct position.
' Otherwise, we have to move to just after the end of the BOM.
Select Case nCodePage
Case CP_UTF16_BE, CP_UTF16_LE
Seek #the_iFileNo, 3
Case CP_UTF8
Seek #the_iFileNo, 4
End Select
Else
' There is no BOM, so simply go the beginning of the file.
Seek #the_iFileNo, 1
End If
Case Else
out_nCodePage = the_nDesiredCodePage
End Select
End Sub
' Purpose: Analogue of 'Open "fileName" For Append As #fileNo'
Public Sub OpenForAppend(ByRef the_sFilename As String, ByVal the_iFileNo As Integer, Optional ByVal the_nCodePage As Long = CP_ACP, Optional ByVal the_bPrefixWithBOM As Boolean = True)
' Open the file and move to the end of the file.
Open the_sFilename For Binary Access Write As #the_iFileNo
Seek the_iFileNo, LOF(the_iFileNo) + 1
If the_bPrefixWithBOM Then
WriteBOM the_iFileNo, the_nCodePage
End If
End Sub
' Purpose: Analogue of 'Open "fileName" For Output As #fileNo'
Public Sub OpenForOutput(ByRef the_sFilename As String, ByVal the_iFileNo As Integer, Optional ByVal the_nCodePage As Long = CP_ACP, Optional ByVal the_bPrefixWithBOM As Boolean = True)
' Ensure we overwrite the file by deleting it ...
On Error Resume Next
Kill the_sFilename
On Error GoTo 0
' ... before creating it.
Open the_sFilename For Binary Access Write As #the_iFileNo
If the_bPrefixWithBOM Then
WriteBOM the_iFileNo, the_nCodePage
End If
End Sub
' Purpose: Analogue of the 'Print #fileNo, value' statement. But only one value allowed.
' Setting <the_bAppendNewLine> = False is analagous to 'Print #fileNo, value;'.
Public Sub Print_(ByVal the_iFileNo As Integer, ByRef the_sValue As String, Optional ByVal the_nCodePage As Long = CP_ACP, Optional ByVal the_bAppendNewLine As Boolean = True)
Const kbytNull As Byte = 0
Const kbytCarriageReturn As Byte = 13
Const kbytNewLine As Byte = 10
Put #the_iFileNo, , ToCPString(the_sValue, the_nCodePage)
If the_bAppendNewLine Then
Select Case the_nCodePage
Case CP_UTF16_BE
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNewLine
Case CP_UTF16_LE
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNewLine
Put #the_iFileNo, , kbytNull
Case CP_UTF32_BE
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNewLine
Case CP_UTF32_LE
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNewLine
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Case Else
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNewLine
End Select
End If
End Sub
Public Sub PutContents(ByRef the_sFilename As String, ByRef the_sFileContents As String, Optional ByVal the_nCodePage As Long = CP_ACP, Optional the_bPrefixWithBOM As Boolean)
Dim iFileNo As Integer
iFileNo = FreeFile
OpenForOutput the_sFilename, iFileNo, the_nCodePage, the_bPrefixWithBOM
Print_ iFileNo, the_sFileContents, the_nCodePage, False
Close iFileNo
End Sub
' Purpose: Converts a VB string (UTF-16) to UTF8 - as a binary array.
Private Function ToCPString(ByRef the_sValue As String, ByVal the_nCodePage As Long) As Byte()
Dim abytOutput() As Byte
Dim nValueLen As Long
Dim nOutputByteLen As Long
If the_nCodePage = CP_UTF16_LE Then
ToCPString = the_sValue
Else
' Cache the input length.
nValueLen = Len(the_sValue)
' See how big the output buffer will be.
nOutputByteLen = WideCharToMultiByte(the_nCodePage, 0&, StrPtr(the_sValue), nValueLen, 0&, 0&, 0&, 0&)
If nOutputByteLen > 0 Then
' Resize output byte array to the size of the UTF-8 string.
ReDim abytOutput(1 To nOutputByteLen)
' Make this API call again, this time giving a pointer to the output byte array.
WideCharToMultiByte the_nCodePage, 0&, StrPtr(the_sValue), nValueLen, VarPtr(abytOutput(1)), nOutputByteLen, 0&, 0&
End If
' Return the array.
ToCPString = abytOutput()
End If
End Function
Private Sub WriteBOM(ByVal the_iFileNo As Integer, ByVal the_nCodePage As Long)
' FF FE UTF-16, little endian
' FE FF UTF-16, big endian
' EF BB BF UTF-8
' FF FE 00 00 UTF-32, little endian
' 00 00 FE FF UTF-32, big-endian
Select Case the_nCodePage
Case CP_UTF8
Put #the_iFileNo, , CByte(&HEF)
Put #the_iFileNo, , CByte(&HBB)
Put #the_iFileNo, , CByte(&HBF)
Case CP_UTF16_LE
Put #the_iFileNo, , CByte(&HFF)
Put #the_iFileNo, , CByte(&HFE)
Case CP_UTF16_BE
Put #the_iFileNo, , CByte(&HFE)
Put #the_iFileNo, , CByte(&HFF)
Case CP_UTF32_LE
Put #the_iFileNo, , CByte(&HFF)
Put #the_iFileNo, , CByte(&HFE)
Put #the_iFileNo, , CByte(&H0)
Put #the_iFileNo, , CByte(&H0)
Case CP_UTF32_BE
Put #the_iFileNo, , CByte(&H0)
Put #the_iFileNo, , CByte(&H0)
Put #the_iFileNo, , CByte(&HFE)
Put #the_iFileNo, , CByte(&HFF)
End Select
End Sub
The following code was added to a Form which had a VSFlexGrid control with Lucida Console font - purely to provide a way to display as many characters as possible:
Option Explicit
Private Sub Command_Click()
Example_ConvertFileToUTF8
End Sub
Private Sub Command2_Click()
Example_IterateUTF8Lines
End Sub
Private Sub Command3_Click()
Example_ReadWriteUTF8Lines
End Sub
Private Sub Form_Load()
VSFlexGrid.ColWidth(0) = 7000!
End Sub
' Purpose: Converts *any* pure text file (UTF16, ASCII, ANSI) to UTF8.
Private Sub Example_ConvertFileToUTF8()
Dim nCodePage As Long
Dim bContainedBOM As Boolean
Dim sFileContents As String
' Read in contents.
sFileContents = TextFile.GetContents("C:\MysteryEncoding.txt", nCodePage, , bContainedBOM)
' And then convert to UTF8.
TextFile.PutContents "C:\output.txt", sFileContents, CP_UTF8, bContainedBOM
End Sub
' Purpose: Iterates through each line of a UTF-8 text file, and adds it to a control which can display VB strings containing non-ANSI characters.
' In this case, I am adding items to a FlexGrid with Font = "Lucida Console".
Private Sub Example_IterateUTF8Lines()
Dim iFileNo As Integer
Dim lCodePage As Long
Dim sLine As String
iFileNo = FreeFile
TextFile.OpenForInput "C:\UTF8.txt", iFileNo, lCodePage
If lCodePage = CP_UTF8 Then
Do While TextFile.LineInputUTF8(iFileNo, sLine)
VSFlexGrid.AddItem sLine
Loop
VSFlexGrid.AddItem sLine
Else
MsgBox "This is not a UTF8 file."
End If
Close #iFileNo
End Sub
Private Sub Example_ReadWriteUTF8Lines()
Dim iFileNoInput As Integer
Dim iFileNoOutput As Integer
Dim lCodePage As Long
Dim bBOM As Boolean
Dim sLine As String
iFileNoInput = FreeFile
TextFile.OpenForInput "C:\UTF8.txt", iFileNoInput, lCodePage, , bBOM
If lCodePage = CP_UTF8 Then
iFileNoOutput = FreeFile
TextFile.OpenForOutput "C:\output.txt", iFileNoOutput, lCodePage, bBOM
Do While TextFile.LineInputUTF8(iFileNoInput, sLine)
TextFile.Print_ iFileNoOutput, sLine, lCodePage
Loop
TextFile.Print_ iFileNoOutput, sLine, lCodePage, False
Close #iFileNoOutput
Else
MsgBox "This is not a UTF8 file."
End If
Close #iFileNoInput
End Sub
I have to some vb6 code to change file encoding ANSI to Encoding UTF-8.
'--- start function for convert to UTF-8
Private Function OpenAppendUTF8(ByVal FileName As String) As Integer
OpenAppendUTF8 = FreeFile(0)
Open FileName For Binary Access Write As #OpenAppendUTF8
Seek #OpenAppendUTF8, LOF(OpenAppendUTF8) + 1
End Function
Sub DeleteFile(ByVal FileToDelete As String)
If Dir$(FileToDelete) = "" Then 'See above
Else
SetAttr FileToDelete, vbNormal
Kill FileToDelete
End If
End Sub
'-
Private Sub WriteUTF8( _
ByVal FNum As Integer, _
ByVal Text As String, _
Optional ByVal NL As Boolean)
Dim lngResult As Long
Dim UTF8() As Byte
If NL Then Text = Text & vbNewLine
lngResult = WideCharToMultiByte(CP_UTF8, 0, StrPtr(Text), Len(Text), _
0, 0, 0, 0)
If lngResult > 0 Then
ReDim UTF8(lngResult - 1)
WideCharToMultiByte CP_UTF8, 0, StrPtr(Text), Len(Text), _
VarPtr(UTF8(0)), lngResult, 0, 0
Put #FNum, , UTF8
End If
End Sub
'------- end function for convert to UTF-8
'>>> coding
Dim ReadString As String
Dim writeString As String
Dim txtNotepad As String
Dim FileNumber As Integer
Dim UTF8 As String
Dim strsql As String
Dim iloop As Integer
FileNumber = FreeFile(1)
'--- if you need to delete file first.
DeleteFile App.Path & "\PRB\fieldRules.txt"
'---
FileNumber = OpenAppendUTF8(App.Path & "\PRB\fieldRules.txt")
iloop = 1
Do While iloop > 0
iloop = InStr(1, txtNotepad, "\n")
' "\n" for end of line
If iloop <= 5 Then
iloop = 0
Else
'writeString = Mid(txtNotepad, 1, iloop - 1)
'Print #FileNumber, writeString
WriteUTF8 FileNumber, Mid(txtNotepad, 1, iloop - 1), True
txtNotepad = Mid(txtNotepad, iloop + 3, 9999)
End If
Loop
Close #FileNumber
<<<< end code

When exporting Word review comments, how do you reference the sentence related to a comment?

I am trying to export a Word document's review comments. I want to export the sentence selection that was commented on followed by the comment.
Screen shot of the image: http://jspeaks.com/mswordcomment.png
I have found code to loop through the document comments, but I cannot figure out how to reference the sentence selection that the comment was related to.
The current logic is:
Sub ExportComments()
Dim s As String
Dim cmt As Word.Comment
Dim doc As Word.Document
For Each cmt In ActiveDocument.Comments
s = s & cmt.Initial & cmt.Index & "," & cmt.Range.Text & vbCr
Next
Set doc = Documents.Add
doc.Range.Text = s
End Sub
I tinkered with Selection.Range, however I cannot determine the proper object or property that contains the referenced sentence.
I would like to produce output like the following (if we use the example in picture above):
Sentence: Here are more sentences that contain interesting facts - Comment: This is an interesting fact.
Sentence: Here are more sentences that contain interesting facts. Here are more sentences that contain interesting facts. - Comment: This is a very interesting fact
I found someone on another site to solve this question.
The key to the solution is: cmt.Scope.FormattedText
Here is the function revised:
Sub ExportComments()
Dim s As String
Dim cmt As Word.Comment
Dim doc As Word.Document
For Each cmt In ActiveDocument.Comments
s = s & "Text: " & cmt.Scope.FormattedText & " -> "
s = s & "Comments: " & cmt.Initial & cmt.Index & ":" & cmt.Range.Text & vbCr
Next
Set doc = Documents.Add
doc.Range.Text = s
End Sub
I have gathered several pieces of code and came to this solution:
Sub CopyCommentsToExcel()
'Create in Word vba
'TODO: set a reference to the Excel object library (Tools --> Reference --> Microsoft Excel 12.0 Object library)
Dim xlApp As Excel.Application
Dim xlWB As Excel.Workbook
Dim i As Integer
Dim HeadingRow As Integer
HeadingRow = 3
Dim cmtRef As Range
Set xlApp = CreateObject("Excel.Application")
xlApp.Visible = True
Set xlWB = xlApp.Workbooks.Add ' create a new workbook
With xlWB.Worksheets(1)
' Create report info
.Cells(1, 1).Formula = "Reviewed document:"
' Create Heading
.Cells(HeadingRow, 1).Formula = "Index"
.Cells(HeadingRow, 2).Formula = "Page"
.Cells(HeadingRow, 3).Formula = "Line"
.Cells(HeadingRow, 4).Formula = "Comment"
.Cells(HeadingRow, 5).Formula = "Reviewer"
.Cells(HeadingRow, 6).Formula = "Date"
For i = 1 To ActiveDocument.Comments.Count
.Cells(2, 1).Formula = ActiveDocument.Comments(i).Parent
.Cells(i + HeadingRow, 1).Formula = ActiveDocument.Comments(i).Index
.Cells(i + HeadingRow, 2).Formula = ActiveDocument.Comments(i).Reference.Information(wdActiveEndAdjustedPageNumber)
.Cells(i + HeadingRow, 3).Formula = ActiveDocument.Comments(i).Reference.Information(wdFirstCharacterLineNumber)
.Cells(i + HeadingRow, 4).Formula = ActiveDocument.Comments(i).Range
.Cells(i + HeadingRow, 5).Formula = ActiveDocument.Comments(i).Initial
.Cells(i + HeadingRow, 6).Formula = Format(ActiveDocument.Comments(i).Date, "dd/MM/yyyy")
' .Cells(i + 1, 3).Formula = ActiveDocument.Comments(i).Parent
' .Cells(i + 1, 3).Formula = ActiveDocument.Comments(i).Application
' .Cells(i + 1, 7).Formula = ActiveDocument.Comments(i).Author
Next i
End With
Set xlWB = Nothing
Set xlApp = Nothing
End Sub
Most valuable help from Microsoft Answers