I am working on some software that cleans up data before sending it into another system. The data comes from all around the world and contains a variety of characters that have to be replaced. For example ‘, : ; #
The system that accepts the parsed data has a very strict character set. It allows
the letters A to Z (upper case only)
the numerals 0 to 9
the special characters / - . space < =
The data arrives in Excel spreadsheets so I have written the following code in a visual basic macro.
fhl_str contains the data to be cleansed
' Replace each known unwanted character in fhl_str with a single space.
' One Replace call is needed per character, so every newly discovered
' unwanted character requires another line here.
fhl_str = Replace(fhl_str, ",", " ")
fhl_str = Replace(fhl_str, "'", " ")
fhl_str = Replace(fhl_str, ":", " ")
fhl_str = Replace(fhl_str, ";", " ")
' Upper-case whatever is left.
fhl_str = ucase(fhl_str)
Now, each time a new unwanted character arrives we have to add a new line of code. e.g. fhl_str = Replace(fhl_str, "#", " ")
My question is
Could I reverse the logic so that the macro looks for A to Z and 0 to 9 and deletes anything else. That way my code would be future proof for new unwanted characters.
Thanks
If you want to replace bad characters with a single space:
Sub KeepOnlyTheGood()
    ' Cleans every cell of the current selection in place.
    ' Each cell value is upper-cased, then every character that is not
    ' A-Z, 0-9, "/", "-", ".", space, "<" or "=" is replaced by one space.
    ' Cells holding an error value are skipped.
    Dim i As Long, L As Long, v As String, CH As String
    Dim t As String
    Dim r As Range

    ' Selection can be a chart, shape, etc.; only ranges can be cleaned.
    If TypeName(Selection) <> "Range" Then Exit Sub

    For Each r In Selection
        If Not IsError(r.Value) Then
            t = ""
            ' Upper-case first so lowercase letters are kept as letters
            ' instead of being blanked out by the A-Z test below.
            v = UCase$(r.Value)
            L = Len(v)
            For i = 1 To L
                CH = Mid$(v, i, 1)
                If CH Like "[0-9A-Z]" Or CH = "/" Or CH = "-" Or CH = "." Or CH = " " Or CH = "<" Or CH = "=" Then
                    t = t & CH
                Else
                    t = t & " "
                End If
            Next i
            r.Value = t
        End If
    Next r
End Sub
Here's some VBA that will do it if you find regex difficult to understand. It uses the ASCII code to determine the only characters to allow. If your scope changes you can modify the ASCII numbers in the Case statement.
Public Function RemoveSpecial(ByVal s As String) As String
    ' Returns s upper-cased, with every character outside the allowed set
    ' replaced by a single space.
    '
    ' Allowed character codes (as returned by Asc):
    '   65-90  A-Z
    '   45-57  "-", ".", "/", 0-9
    '   32     space
    '   60-61  "<", "="
    '
    ' s is taken ByVal so the caller's variable is not upper-cased as a
    ' side effect.
    Dim sResult As String
    Dim sChar As String
    Dim nIndex As Long      ' Long: strings may exceed 32,767 characters

    s = UCase$(s)
    For nIndex = 1 To Len(s)
        sChar = Mid$(s, nIndex, 1)
        Select Case Asc(sChar)
            Case 65 To 90, 45 To 57, 32, 60 To 61
                sResult = sResult & sChar
            Case Else
                sResult = sResult & " "
        End Select
    Next nIndex
    RemoveSpecial = sResult
End Function
Usage:
Debug.Print RemoveSpecial("TeSt<>=.##")
or something like:
Range("A1") = RemoveSpecial("TeSt<>=.##")
ASCII Codes
Related
I have a huge list of strings for which I am trying to generate regular expressions in an automated way. The strings are pretty simple and I would like to generate the regular expressions using a formula or VBA code. The strings use the following legend:
& - Any UPPERCASE character (A-Z)
# - Any digits (0-9)
_ - Space (\s)
- - Dash
For example, the regular expression generated for the following strings:
Policy Number Policy Digits Regular Expression
####&&###### 12 ^\d{4}[A-Z]{2}\d{6}$
####&_###### 11 ^\d{4}[A-Z]{1}\s\d{6}$
ACPBP&&########## 17 ^[ACPBP]{5}[A-Z]{2}\d{10}$
ACPBA&########## or ACPBA&&########## 16 or 17 ^[ACPBA]{5}[A-Z]{1,2}\d{10}$
########## 10 ^\d{10}$
09############ 14 ^[09]{2}\d{12}$
A&&######, A&&#######, or A&&######## 9, 10 or 11 ^[A]{1}[A-Z]{2}\d{6,8}$
&&&####, &&&#####, or &&&###### 7, 8, or 9 ^[A-Z]{3}\d{4,6}$
09-##########-## 14 ^[09]{2}-\d{10}-\d{2}$
Is there some existing code that is available to generate regular expressions for a huge list of strings? What are some of the hints or tips that I can use to build a regular expression string? Thanks in advance.
There is no existing code, but try this:
Option Explicit
Option Compare Text 'to handle upper and lower case "or"
'Set reference to Microsoft Scripting Runtime
' or use Late Binding if distributing this
Function createRePattern(sPolicyNum As String) As String
    ' Builds a regular-expression pattern from a policy-number template.
    '
    ' Template symbols:  &  -> [A-Z]     #  -> \d     _  -> \s
    ' Any other character is emitted literally (it is not regex-escaped).
    ' A run of the same symbol is written once with a {count} quantifier.
    ' Several templates may be given, separated by commas and/or the text
    ' "or"; they are combined as ^(?:alt1|alt2|...)$.
    '
    ' Returns an empty string when the input contains no template at all.
    '
    ' NOTE(review): Replace is called without a compare argument, so whether
    ' an upper-case "OR" is also treated as a separator should be confirmed.
    Dim dCode As Dictionary
    Dim vParts As Variant, vPart As Variant
    Dim sPN As String, sCh As String, sReg As String
    Dim I As Long, lRun As Long

    Set dCode = New Dictionary
    dCode.Add Key:="&", Item:="[A-Z]"
    dCode.Add Key:="#", Item:="\d"
    dCode.Add Key:="_", Item:="\s"

    vParts = Split(Replace(sPolicyNum, "or", ","), ",")

    For Each vPart In vParts
        sPN = Trim(vPart)
        If Len(sPN) > 0 Then
            I = 1
            Do While I <= Len(sPN)
                sCh = Mid(sPN, I, 1)

                ' Measure the run of identical characters starting at I.
                ' StrComp with vbBinaryCompare keeps "a" and "A" distinct
                ' even though the module uses Option Compare Text.
                lRun = 1
                Do While I + lRun <= Len(sPN)
                    If StrComp(Mid(sPN, I + lRun, 1), sCh, vbBinaryCompare) <> 0 Then Exit Do
                    lRun = lRun + 1
                Loop

                If dCode.Exists(sCh) Then
                    sReg = sReg & dCode(sCh)
                Else
                    sReg = sReg & sCh
                End If
                If lRun > 1 Then sReg = sReg & "{" & lRun & "}"

                I = I + lRun
            Loop
            sReg = sReg & "|"
        End If
    Next vPart

    ' Nothing usable in the input: avoid Left(sReg, -1) raising an error.
    If Len(sReg) = 0 Then
        createRePattern = vbNullString
        Exit Function
    End If

    sReg = Left(sReg, Len(sReg) - 1)    ' drop the trailing "|"

    ' Non-capturing group added only if alternation is present
    If InStr(sReg, "|") = 0 Then
        createRePattern = "^" & sReg & "$"
    Else
        createRePattern = "^(?:" & sReg & ")$"
    End If
End Function
Note
As written, there are limitations in that you cannot reference the literal strings:
#, &, _, the comma (,), or the word "or"
Generate regex patterns without dictionary
In addition to Ron's valid solution an alternative using no dictionary:
Option Explicit ' declaration head of code module
Function generateRePattern(ByVal s As String) As String
    ' Builds a regular-expression pattern from a policy-number template
    ' without using a dictionary.
    '
    ' " or " and "," separate alternative templates; all remaining blanks
    ' are removed. Each run of identical characters is translated through
    ' getPattern() and followed by a {count} quantifier when the run is
    ' longer than one character. The result is wrapped as ^(?:...)$.
    '
    ' Returns an empty string when nothing is left after normalisation.
    Const Pipe As String = "|"
    Dim result As String        ' pattern body built so far
    Dim curChar As String       ' character that starts the current run
    Dim token As String         ' regex fragment for the current run
    Dim pos As Long             ' start position of the current run
    Dim n As Long               ' length of the current run

    '[1] Pipe replacement for "or" and commas
    s = Replace(Replace(Replace(s, " or ", Pipe), " ", ""), ",", Pipe)

    ' The length must be read AFTER normalisation; the string may have shrunk.
    If Len(s) = 0 Then
        generateRePattern = vbNullString
        Exit Function
    End If

    '[2] analyse the string run by run
    pos = 1
    Do While pos <= Len(s)
        curChar = Mid$(s, pos, 1)
        n = 1
        Do While pos + n <= Len(s)
            If Mid$(s, pos + n, 1) <> curChar Then Exit Do
            n = n + 1
        Loop

        If curChar = Pipe Then
            ' ", or " normalises to "||": consecutive separators count once
            ' and never receive a quantifier.
            result = result & Pipe
        Else
            ' Strip any "{n}" placeholder the helper may return and append
            ' the real count, so literals and "\s" are repeated correctly
            ' and a literal "n" is never overwritten.
            token = Replace(getPattern(curChar), "{n}", "")
            If n > 1 Then token = token & "{" & n & "}"
            result = result & token
        End If

        pos = pos + n
    Loop

    '[3] return function result
    generateRePattern = "^(?:" & result & ")$"
End Function
Help function getPattern()
Function getPattern(curChar) As String
    ' Purpose: return the regex fragment for one template character.
    '   "&" -> "[A-Z]{n}"    "#" -> "\d{n}"    "_" -> "\s{n}"
    ' Any other character is returned unchanged (as a literal).
    ' "{n}" is a placeholder that the caller replaces with the repeat count.
    '
    ' A plain Select Case is used instead of Application.Match because an
    ' exact-match lookup there treats "*" and "?" in the lookup value as
    ' wildcards, so a literal "*" would be reported as the first symbol.
    Select Case curChar
        Case "&"
            getPattern = "[A-Z]{n}"
        Case "#"
            getPattern = "\d{n}"
        Case "_"
            ' Carries {n} like the others so repeated blanks are counted.
            getPattern = "\s{n}"
        Case Else
            getPattern = curChar
    End Select
End Function
Example
Say I have a string:
"I say ""Hello world"" and she says ""Excuse me?"""
VBA will interpret this string as:
I say "Hello world" and she says "Excuse me?"
A more complex example:
I have a string:
"I say ""Did you know that she said """"Hi there!"""""""
VBA interprets this string as:
I say "Did you know that she said ""Hi there!"""
If we remove "I say "
"Did you know that she said ""Hi there!"""
we can continue parsing the string in vba:
Did you know that she said "Hi there!"
Problem
Ultimately I want some function, sBasicQuote(quotedStringHierarchy as string), which returns a string containing the next level up in the string hierarchy.
E.G.
dim s as string
s = "I say ""Did you know that she said """"Hi there!"""""""
s = sBasicQuote(s) ' returns 'I say "Did you know that she said ""Hi there!"""'
s = sBasicQuote(s) ' returns 'Did you know that she said "Hi there!"'
s = sBasicQuote(s) ' returns 'Hi there!'
I just can't figure out an algorithm that would work for this... You almost need to replace all double quotes, but once you've replaced the nth double quote you have to skip to the (n+1)th double quote?
How does one implement this in VBA?
You could do something like this
Public Sub test()
    ' Prints elements 0, 1 and 2 returned by DoubleQuote for a sample
    ' string that contains nested (doubled) quotes.
    Dim s As String
    Dim level As Integer

    s = "I say ""Did you know that she said """"Hi there!"""""""
    For level = 0 To 2
        Debug.Print DoubleQuote(s, level)
    Next level
End Sub
Public Function DoubleQuote(ByVal strInput As String, ByVal intElement As Integer) As String
    ' Collapses every doubled quote ("") in strInput into a single quote,
    ' splits the result on the quote character and returns the piece at
    ' zero-based position intElement.
    '
    ' Both arguments are taken ByVal so the caller's string is not
    ' overwritten by the collapsed copy.
    ' An intElement outside the range of pieces raises run-time error 9.
    Dim a() As String

    strInput = Replace(strInput, String(2, Chr(34)), String(1, Chr(34)))
    a = Split(strInput, Chr(34))
    DoubleQuote = a(intElement)
End Function
Another slightly modified version is a little more accurate
Public Function DoubleQuote(strInput As String, intElement As Integer) As String
    ' Splits strInput on the quote character and rebuilds a list of pieces
    ' in which a piece that followed a doubled quote is returned wrapped in
    ' quotes. Returns the piece at zero-based position intElement.
    '
    ' An intElement beyond the rebuilt list raises run-time error 9.
    Dim a() As String
    Dim b() As String
    Dim i As Long

    ReDim b(0)
    a = Split(strInput, Chr(34))

    ' A Do loop is used because the index advances by one or two tokens.
    i = 0
    Do While i <= UBound(a)
        If Len(a(i)) = 0 Then
            ' An empty token marks a doubled quote; the next token is the
            ' quoted text. A trailing empty token has nothing after it.
            If i = UBound(a) Then Exit Do
            b(UBound(b)) = Chr(34) & a(i + 1) & Chr(34)
            i = i + 2
        Else
            ' Includes the final token, so plain text after the last quote
            ' (or text without any quote) is no longer dropped.
            b(UBound(b)) = a(i)
            i = i + 1
        End If
        ReDim Preserve b(UBound(b) + 1)
    Loop

    DoubleQuote = b(intElement)
End Function
I think the following will return what you are looking for in your nested quote example. Your first example is not really a situation of nested quotes.
Option Explicit
Sub NestedQuotes()
    ' Peels a nested-quote string one level at a time and prints every
    ' level, starting with the original text.
    Const s As String = "I say ""Did you know that she said """"Hi there!"""""""
    Dim levels As Collection
    Dim current As String
    Dim quoteChar As String
    Dim innerStart As Long
    Dim idx As Long

    quoteChar = Chr(34)
    Set levels = New Collection
    levels.Add s
    current = s

    Do While InStr(current, quoteChar) <> 0
        ' Collapse doubled quotes of the most recent level ...
        current = Replace(levels(levels.Count), quoteChar & quoteChar, quoteChar)
        ' ... then keep only what lies between its first and last quote.
        innerStart = InStr(current, quoteChar) + 1
        current = Mid(current, innerStart, InStrRev(current, quoteChar) - innerStart)
        levels.Add current
    Loop

    For idx = 1 To levels.Count
        Debug.Print levels(idx)
    Next idx
End Sub
My Solution
I spent some more time thinking and came up with this solution.
Function sMineDoubleQuoteHierarchy(s As String) As String
    ' Returns the next level of a nested-quote string: the text between the
    ' first and the last quote of s, with every doubled quote ("") inside it
    ' reduced to a single quote.
    '
    ' Returns an error text when s holds an odd number of quotes, and an
    ' empty string when s holds no quotes at all.
    ' s itself is left unchanged.
    Dim sQuote As String
    Dim lQuotes As Long, lStart As Long, lEnd As Long

    sQuote = Chr$(34)

    'Check the number of quotes in the string is even - sanity check
    lQuotes = Len(s) - Len(Replace(s, sQuote, ""))
    If lQuotes Mod 2 <> 0 Then
        sMineDoubleQuoteHierarchy = "Error - Odd number of quotes found in sMineDoubleQuoteHierarchy() function"
        Exit Function
    End If

    'No quotes: there is no deeper level (and Mid would fail on length -1)
    If lQuotes = 0 Then
        sMineDoubleQuoteHierarchy = vbNullString
        Exit Function
    End If

    'Find the outermost quotes
    lStart = InStr(1, s, sQuote)
    lEnd = InStrRev(s, sQuote)

    'Replace scans left to right without re-reading what it already
    'replaced, so """" becomes "" (one level) and not a single ".
    sMineDoubleQuoteHierarchy = Replace(Mid$(s, lStart + 1, lEnd - lStart - 1), sQuote & sQuote, sQuote)
End Function
What's going on in this solution?
The first part of the process is removing the first and last single quote from the string and returning the text between them. Then we loop through the string, replacing each instance of "" with ". Each time we do this we skip to the next character to ensure strings like """" go to "" instead of ".
Does anyone else have a better/more compact solution?
Edit
After all the suggestions in this forum I settled on this. It has some extra error trapping to validate nested strings.
Public Function DoubleQuoteExtract(ByVal s As String, Optional ByRef ErrorLevel As Boolean) As String
    ' Parses s the way BASIC reads a string literal: returns the text
    ' between the first and the last quote of s, with every doubled quote
    ' ("") inside it replaced by a single quote.
    '
    ' ErrorLevel is set to False on success and True on failure; on failure
    ' the return value is a text starting with "#Error".
    '
    ' Validation, by number of quotes found:
    '   odd count               -> quotes cannot be paired           -> error
    '   "..."      2 quotes, 2 / 2 = 1 (odd)                         -> okay
    '   ""...""    4 quotes, 4 / 2 = 2 (even)                        -> error
    '   """..."""  6 quotes, 6 / 2 = 3 (odd)                         -> okay
    '   no quotes  0 quotes, 0 / 2 = 0 (even)                        -> error
    Dim sQuote As String
    Dim countQuote As Long
    Dim lFirst As Long, lLast As Long

    sQuote = Chr$(34)
    countQuote = Len(s) - Len(Replace(s, sQuote, ""))

    'An odd number of quotes cannot be paired correctly
    If countQuote Mod 2 <> 0 Then
        DoubleQuoteExtract = "#Error - Odd number of quotes found in DoubleQuoteExtract() function"
        GoTo ErrorOccurred
    End If

    'The hierarchy is complete only when half the quote count is odd
    If (countQuote \ 2) Mod 2 = 0 Then
        DoubleQuoteExtract = "#Error - Incorrect number of double quotes forming an incomplete hierarchy."
        GoTo ErrorOccurred
    End If

    'Keep only what lies between the outermost quotes; text before the
    'first quote and after the last quote is discarded.
    lFirst = InStr(1, s, sQuote)
    lLast = InStrRev(s, sQuote)
    s = Mid$(s, lFirst + 1, lLast - lFirst - 1)

    'Replace all instances of "" with "
    DoubleQuoteExtract = Replace(s, sQuote & sQuote, sQuote)
    ErrorLevel = False
    Exit Function

ErrorOccurred:
    ErrorLevel = True
End Function
Counting characters in Vb6 without including the spaces (string)
Just use Len() and Replace() to retrieve the length of your string with the spaces removed. For example:
' Len counts every character; removing the spaces first gives the count
' of non-space characters.
Const strText As String = "The quick brown fox"
Debug.Print "Original length: " & Len(strText) ' => 19
Debug.Print "Length w/o spaces: " & Len(Replace$(strText, " ", "")) ' => 16
Function NonSpaceCount(ByRef Text As String) As Long
    ' Counts the characters of Text that are not the space character, by
    ' jumping from one space to the next with InStr and adding up the
    ' lengths of the stretches in between.
    Dim prevSpace As Long   ' position of the last space seen (0 = none yet)
    Dim nextSpace As Long   ' position of the next space, 0 when none is left
    Dim total As Long

    If Len(Text) = 0 Then Exit Function

    Do
        nextSpace = InStr(prevSpace + 1, Text, " ")
        If nextSpace = 0 Then
            ' No further space: everything after prevSpace counts.
            total = total + Len(Text) - prevSpace
            Exit Do
        End If
        total = total + nextSpace - (prevSpace + 1)
        prevSpace = nextSpace
    Loop

    NonSpaceCount = total
End Function
Speed isn't always everything, but this should be faster than most alternatives.
I need to add brackets around the numbers in a string found in cells on my Excel worksheet.
For example, say I am given:
913/(300+525)
I need to get this in return:
[913]/([300]+[525])
The equations are fairly simple, should only have to deal with + - * / ( ) characters.
I attempted looping through the string character by character using the MID function but I can't get the loop(s) working correctly and end up getting a jumbled mess of random brackets and numbers back. I also considered using regular expressions but I've never used them before and have no idea if this would be a good application.
Please let me know if you need anything else. Thank you for your time!
They can be decently long. Here is another example:
I have:
(544+(1667+1668+1669+1670+1671+1672+1673)-1674)
But I need:
([544]+([1667]+[1668]+[1669]+[1670]+[1671]+[1672]+[1673])-[1674])
I just threw this together but it should work
Function generateBrackets(Equation As String) As String
    ' Returns Equation with every run of digits wrapped in square brackets,
    ' e.g. "913/(300+525)" -> "[913]/([300]+[525])".
    ' All other characters are copied through unchanged.
    Dim temp As String
    Dim ch As String
    Dim inNumber As Boolean     ' True while inside an open "[" ... run
    Dim isDigit As Boolean
    Dim x As Long

    For x = 1 To Len(Equation)
        ch = Mid$(Equation, x, 1)
        isDigit = (ch Like "[0-9]")

        If isDigit And Not inNumber Then
            temp = temp & "["           ' a number starts here
        ElseIf inNumber And Not isDigit Then
            temp = temp & "]"           ' the number ended just before ch
        End If

        temp = temp & ch
        inNumber = isDigit
    Next x

    ' Close the bracket when the equation ends with a number.
    If inNumber Then temp = temp & "]"

    generateBrackets = temp
End Function
Here is a way which caters for Decimal numbers.
'~~> Add here whatever operators your equation
'~~> is likely to have
Const delim As String = "+()-/*"

Sub Sample()
    ' Prints a sample equation with every number (decimals included)
    ' wrapped in square brackets.
    '
    ' The equation is scanned once, left to right: every stretch of
    ' characters that are neither an operator from delim nor a blank is
    ' treated as one number. Scanning (instead of Replace on the whole
    ' string) brackets each number exactly once, even when the same number
    ' occurs twice or is contained in a longer one (16 inside 1667).
    Dim sSamp As String
    Dim sOut As String
    Dim sCh As String
    Dim i As Long
    Dim bInNumber As Boolean

    sSamp = "(5.44+(16.67+1668+1669+1670+1671+1672+1673)-1674)"

    For i = 1 To Len(sSamp)
        sCh = Mid$(sSamp, i, 1)
        If sCh = " " Or InStr(1, delim, sCh, vbBinaryCompare) > 0 Then
            ' Operator or blank: close a number that is still open.
            If bInNumber Then
                sOut = sOut & "]"
                bInNumber = False
            End If
        ElseIf Not bInNumber Then
            sOut = sOut & "["
            bInNumber = True
        End If
        sOut = sOut & sCh
    Next i

    ' Close the bracket when the equation ends with a number.
    If bInNumber Then sOut = sOut & "]"

    Debug.Print sOut
End Sub
Function GetNewString(s As String) As String
    ' Returns s with every operator listed in the module-level constant
    ' delim replaced by a blank, runs of blanks collapsed to one blank,
    ' and leading/trailing blanks removed - i.e. the numbers of the
    ' equation separated by single spaces.
    Dim sTemp As String
    Dim i As Long

    sTemp = s
    For i = 1 To Len(delim)
        sTemp = Replace(sTemp, Mid(delim, i, 1), " ")
    Next i

    ' Collapse double blanks until none is left. Searching for TWO blanks
    ' matters: searching for a single blank would never terminate.
    Do While InStr(1, sTemp, "  ") > 0
        sTemp = Replace(sTemp, "  ", " ")
    Loop

    GetNewString = Trim(sTemp)
End Function
Input
"(5.44+(16.67+1668+1669+1670+1671+1672+1673)-1674)"
Output
([5.44]+([16.67]+[1668]+[1669]+[1670]+[1671]+[1672]+[1673])-[1674])
I have thousands of addresses in this format:
123 Happy St. Kansas City, MO 64521
9812 Main Street Minneapolis, MN 62154
12 Virgina Ave, Apt 8, Dallas, TX 54334
I want to extract the address, city, state, zip into individual cells (without using VB if possible). I've tried a couple variations of other methods posted, but I can't quite get desired results.
Analyze your problem!
you want to split your address string at the comma
you then want to split the right fragment from (1) at the first blank
ad 1): you get the position of the comma using =FIND(",", A1), and use the result in a =LEFT(...) and a =RIGHT(...) - for the latter you also need the string length (=LEN(...))
B1: =LEFT(A1;FIND(",";A1)-1)
C1: =RIGHT(A1;LEN(A1)-LEN(B1)-2)
Now comes the fun part ... in your 3rd example we mustn't split on the first comma, but on the third comma ... or as a more general rule, we always must split on the last comma .... but how do we find how many commas we have in the string, to feed its position as an additional argument into the =FIND(...) function?
Quick answer: look at Stackoverflow (exactly here) ... very clever ... subtract the length of the string with all commas removed from the original length to get the number of commas, and then replace the last occurrence of the comma by something else, because =SUBSTITUTE(...) works on occurrence, whilst =FIND() only works on position. If you incorporate all this, you will have
B1: =LEFT(A1;FIND("#";SUBSTITUTE(A1;",";"#"; LEN(A1)-LEN(SUBSTITUTE(A1;",";""))))-1) --> full address
C1: (same as above)
Here we use "#" as a neutral substitution string for the final comma, as we assume that no address uses the "#"
ad 2): you apply the above (with blank instead of comma) once again to the right part. You can use the simple first version of the formulae as it's clear you want to split at the first blank
D1: =LEFT(C1;FIND(" ";C1)-1) --> state
E1: =RIGHT(C1;LEN(C1)-LEN(D1)-1) --> ZIP code
This VBA function extracts Zip, State, City, Street1, and Street2 (Suite, Apt, etc.) into separate columns. Would need minor modification to remove commas.
Option Explicit
Function ParseAddress(ByVal varAddress As Variant, ByVal strAddressPart As String) As String
    ' Extracts one part of a US-style address of the form
    '   <street> [<Apt|Suite|#> <unit>] <city> <state> <zip>
    ' whose words are separated by single spaces.
    '
    ' varAddress     - the address text.
    ' strAddressPart - "Zip", "State", "City", "Street1" or "Street2".
    '
    ' Returns the requested part, or an empty string when the address is
    ' empty, strAddressPart is not one of the names above, or the address
    ' has too few words to contain that part.
    '
    ' Limitations: the city is taken to be the single word before the
    ' state, so the leading words of a multi-word city end up in Street1.
    ' The address is assumed never to begin with "#", "Suite" or "Apt".
    Dim aryAddressTokens() As String
    Dim strCity As String
    Dim strStreet1 As String, strStreet2 As String
    Dim intCtr As Long
    Dim intStreet2Tokens As Long
    Dim lngLast As Long             ' index of the last word

    ParseAddress = ""
    If varAddress = vbNullString Then Exit Function

    aryAddressTokens = Split(Trim(varAddress), " ")
    lngLast = UBound(aryAddressTokens)
    If lngLast < 0 Then Exit Function       ' address was only blanks

    Select Case strAddressPart
        Case "Zip"
            ParseAddress = aryAddressTokens(lngLast)

        Case "State"
            If lngLast >= 1 Then ParseAddress = UCase(aryAddressTokens(lngLast - 1))

        Case "City"
            If lngLast >= 2 Then
                strCity = aryAddressTokens(lngLast - 2)
                If Right(strCity, 1) = "," Then strCity = Left(strCity, Len(strCity) - 1)
                ParseAddress = strCity
            End If

        Case "Street1", "Street2"
            'Find Street2 first because the Street1 output depends on it.
            intCtr = 1
            strStreet2 = ""
            intStreet2Tokens = 0
            Do While (intCtr < lngLast - 2) And strStreet2 = ""
                If Left(aryAddressTokens(intCtr), 1) = "#" Then
                    If Len(aryAddressTokens(intCtr)) = 1 Then
                        ' "#" on its own: the unit is the next word
                        strStreet2 = aryAddressTokens(intCtr) & aryAddressTokens(intCtr + 1)
                        intStreet2Tokens = 2
                    Else
                        strStreet2 = aryAddressTokens(intCtr)
                        intStreet2Tokens = 1
                    End If
                ElseIf Left(aryAddressTokens(intCtr), 5) = "Suite" Then
                    If Len(aryAddressTokens(intCtr)) = 5 Then
                        strStreet2 = aryAddressTokens(intCtr) & " " & aryAddressTokens(intCtr + 1)
                        intStreet2Tokens = 2
                    Else
                        strStreet2 = aryAddressTokens(intCtr)
                        intStreet2Tokens = 1
                    End If
                ElseIf Left(aryAddressTokens(intCtr), 3) = "Apt" Then
                    strStreet2 = aryAddressTokens(intCtr) & " " & aryAddressTokens(intCtr + 1)
                    intStreet2Tokens = 2
                End If
                intCtr = intCtr + 1
            Loop
            If Right(strStreet2, 1) = "," Then strStreet2 = Left(strStreet2, Len(strStreet2) - 1)

            ' Now Street1: every word before Street2 / city / state / zip.
            strStreet1 = ""
            For intCtr = 0 To lngLast - (3 + intStreet2Tokens)
                strStreet1 = strStreet1 & " " & aryAddressTokens(intCtr)
            Next
            If Right(strStreet1, 1) = "," Then strStreet1 = Left(strStreet1, Len(strStreet1) - 1)

            'Assign.
            If strAddressPart = "Street1" Then
                ParseAddress = Trim(strStreet1)
            Else
                ParseAddress = Trim(strStreet2)
            End If
    End Select
End Function