Regex get Group Names? - swift

Looking for a way to get Group Names using Swift, I found info using GetGroupNames in C# (How do I get the name of captured groups in a C# Regex?) but can't find anything similar. Want to be able to tell is a group exists.
At the moment using code below ...
let search = "Agora.2010.1080p.BluRay.x264.mp4"
let regex "^(Agora.)(?<"group name">2010)?"
let return_results = regex(for: regex, in: search)
func regex(for expression: String, in string: String) -> [String]? {
guard let regex = try? NSRegularExpression(pattern: expression, options: [.caseInsensitive]) else { return nil }
let results = regex.matches(in: string, range: NSRange(string.startIndex..., in: string))
guard !results.isEmpty else { return nil }
var values: [String] = []
guard regex.numberOfCaptureGroups > 0 else { return nil }
for r in 1...(regex.numberOfCaptureGroups) {
let range = results[0].range(at: r)
if range.length > 0 {
let start = string.index(string.startIndex, offsetBy: range.location)
let end = string.index(start, offsetBy: range.length)
let value = string[start..<end]
if !value.isEmpty {
let value_string = String(value)
values.append(value_string)
}
}
}
return values
At the moment this just looks for any groups. What I want is to return is a Dictionary list of ["group name":"value"]. I've tried from attached link ... "regex.Match(line).Groups" & "regex.GetGroupNames()" which do not exist.

Related

Replace in string with regex

I am struggling to modify captured value with regex.
For example, I wanna change "Hello, he is hero" to "HEllo, HE is HEro" using Regex.
I know there are ways to change this without regex, but it is just an example to show the problem. I actually use the regex instead of just he, but I cannot provide it here. That is why using regex is required.
The code below somehow does not work. Are there any ways to make it work?
"Hello, he is hero".replacingOccurrences(
of: #"(he)"#,
with: "$1".uppercased(), // <- uppercased is not applied
options: .regularExpression
)
You need to use your regex in combination with Range (range(of:)) to find matches and then replace each found range separately
Here is a function as an extension to String that does this by using range(of:) starting from the start of the string and then moving the start index to match from forward to after the last match. The actual replacement is done inside a separate function that is passed as an argument
extension String {
func replace(regex: String, with replace: (Substring) -> String) -> String {
var string = self
var startIndex = self.startIndex
let endIndex = self.endIndex
while let range = string.range(of: regex, options: [.regularExpression] , range: startIndex..<endIndex) {
if range.isEmpty {
startIndex = string.index(startIndex, offsetBy: 1)
if startIndex >= endIndex { break }
continue
}
string.replaceSubrange(range, with: replace(string[range]))
startIndex = range.upperBound
}
return string
}
}
Example where we do an case insensitive search for words starting with "he" and replace each match with the uppercased version
let result = "Hello, he is hero. There he is".replace(regex: #"(?i)\bhe"#) {
$0.uppercased()
}
Output
HEllo, HE is HEro. There HE is
You can try NSRegularExpression. Something like:
import Foundation
var sourceStr = "Hello, he is hero"
let regex = try! NSRegularExpression(pattern: "(he)")
let matches = regex.matches(in: sourceStr, range: NSRange(sourceStr.startIndex..., in: sourceStr))
regex.enumerateMatches(in: sourceStr, range: NSRange(sourceStr.startIndex..., in: sourceStr)) { (match, _, _) in
guard let match = match else { return }
guard let range = Range(match.range, in: sourceStr) else { return }
let sub = sourceStr[range]
sourceStr = sourceStr.replacingOccurrences(of: sub, with: sub.uppercased(), options: [], range: range)
}
print(sourceStr)
this is the solution i can provide
var string = "Hello, he is hero"
let occurrence = "he"
string = string.lowercased().replacingOccurrences(
of: occurrence,
with: occurrence.uppercased(),
options: .regularExpression
)
print(string)

Swift 5.1 Regex Bug

for the following code:
import Foundation
extension String {
var fullRange: NSRange {
return .init(self.startIndex ..< self.endIndex, in: self)
}
public subscript(range: Range<Int>) -> Self.SubSequence {
let st = self.index(self.startIndex, offsetBy: range.startIndex)
let ed = self.index(self.startIndex, offsetBy: range.endIndex)
let sub = self[st ..< ed]
return sub
}
func split(regex pattern: String) throws -> [String] {
let regex = try NSRegularExpression.init(pattern: pattern, options: [])
let fRange = self.fullRange
let match = regex.matches(in: self, options: [], range: fRange)
var list = [String]()
var start = 0
for m in match {
let r = m.range
let end = r.location
list.append(String(self[start ..< end]))
start = end + r.length
}
if start < self.count {
list.append(String(self[start ..< self.count]))
}
return list
}
}
print(try! "مرتفع جداً\nVery High".split(regex: "\n"))
the output should be :
["مرتفع جداً", "Very High"]
but instead it is:
["مرتفع جداً\n", "ery High"]
that because regex (for this case) matched the \n at the offset 10 instead of 9
is there any thing wrong in my code, or it is a bug in swift with regex !!
It's not a bug. You are trying to use Int indexes which is error-prone and strongly discouraged in an Unicode environment.
This is the equivalent of your code with the proper String.Index type and the dedicated API to convert NSRange to Range<String.Index> and vice versa. fullRange and subscript are obsolete.
I just left out the print line. startIndex and endIndex are properties of String
extension String {
func split(regex pattern: String) throws -> [String] {
let regex = try NSRegularExpression(pattern: pattern)
let matches = regex.matches(in: self, range: NSRange(startIndex..., in: self))
var list = [String]()
var start = startIndex
for match in matches {
let range = Range(match.range, in: self)!
let end = range.lowerBound
list.append(String(self[start..<end]))
start = range.upperBound
}
if start < endIndex {
list.append(String(self[start..<endIndex]))
}
return list
}
}
print(try! "مرتفع جداً\nVery High".split(regex: "\n"))
The result is ["مرتفع جداً", "Very High"]
I found the issue behind this bug?!
Swift Strings are so much weirder than any other language; since every character is 4 bytes length, then a single character (may, would, will, ..) contains 1 or 2 unicode characters (witch what happened in my case), so the solution is to subarray the unicodeScalars of the swift String instead of the string it self !!

Swift Regex optimization

i want to look in an array of strings to get all strings containing subbstring. This function should also work with wildcard.
I wrote this function:
func wordcontains(word: String, from words: [String]) -> [String] {
//Si il y a des jokers on utilise la methode regex
//Sinon on utilise la methode simple car beaucoup plus rapide
let foundWords = words.filter { otherWord in
let wordregex = word.replacingOccurrences(of: "?", with: ".")
if (otherWord.range(of: "[A-Z]*\(wordregex)[A-Z]*", options: .regularExpression) != nil){
return true
}else {
return false
}
}
return foundWords
}
and it's working like that:
input : anagrams(word: "ARC?", from: ["BOU", "BAC", "ARCS", "ARCH", "TREE","ARCHE","PROUE"])
output : ["ARCS", "ARCH", "ARCHE"]
it's working well with a small array, but i need to check in an array of 300000 words and it take a while.
What is the best way to optimize the regex / function?
Perhaps there is a better approch ?
For your interest, the code I used for testing.
Create a Command Line Tool project.
import Foundation
func wordcontains(word: String, from words: [String]) -> [String] {
...(exactly the same code as yours)...
}
///Creating NSRegularExpression outside of the loop
func wordcontains2(word: String, from words: [String]) -> [String] {
let wordregex = word.replacingOccurrences(of: "?", with: ".")
let pattern = "[A-Z]*\(wordregex)[A-Z]*"
let regex: NSRegularExpression
do {
regex = try NSRegularExpression(pattern: pattern)
} catch {
fatalError(error.localizedDescription)
}
let foundWords = words.filter { otherWord in
regex.firstMatch(in: otherWord, range: NSRange(0..<otherWord.utf16.count)) != nil
}
return foundWords
}
/// Removing `[A-Z]*` from both ends as suggested in rmaddy's comment.
/// This assumes all words in the parameter `words` consists only capital letters.
func wordcontains3(word: String, from words: [String]) -> [String] {
let wordregex = word.replacingOccurrences(of: "?", with: ".")
let regex: NSRegularExpression
do {
regex = try NSRegularExpression(pattern: wordregex)
} catch {
fatalError(error.localizedDescription)
}
let foundWords = words.filter { otherWord in
regex.firstMatch(in: otherWord, range: NSRange(0..<otherWord.utf16.count)) != nil
}
return foundWords
}
Generally, creating an instance of NSRegularExpression is an expensive operation, so moving it outside of the loop may improve the performance (of course, in case the regex does not change), but the effect is very limited.
And I added some code for testing.
func makeRandomWords(_ count: Int) -> [String] {
var words: [String] = []
for _ in 0..<count {
let len = Int.random(in: 3...5)
var word = ""
for _ in 0..<len {
let charCode = UInt32.random(in: UInt32(UInt8(ascii: "A"))...UInt32(UInt8(ascii: "Z")))
word.append(Character(UnicodeScalar(charCode)!))
}
words.append(word)
}
return words
}
let words = makeRandomWords(300_000) //I have found the number of words is `300000` after I wrote my comment...
do {
let date1 = Date()
let w1 = wordcontains(word: "ARC?", from: words)
let date2 = Date()
print(date2.timeIntervalSince(date1), w1)
let date3 = Date()
let w2 = wordcontains2(word: "ARC?", from: words)
let date4 = Date()
print(date4.timeIntervalSince(date3), w2)
let date5 = Date()
let w3 = wordcontains3(word: "ARC?", from: words)
let date6 = Date()
print(date6.timeIntervalSince(date5), w3)
}
Result:
6.443639039993286 ["ARCQJ", "ARCZB", "AARCI", "ARCR", "ARCR", "ARCQS", "ARCGM", "ARCKL", "UARCN", "FARCS", "ARCNA", "ARCZM", "PARCL", "ARCTA", "ARCS", "ARCE", "ARCG", "ARCE"]
1.7534430027008057 ["ARCQJ", "ARCZB", "AARCI", "ARCR", "ARCR", "ARCQS", "ARCGM", "ARCKL", "UARCN", "FARCS", "ARCNA", "ARCZM", "PARCL", "ARCTA", "ARCS", "ARCE", "ARCG", "ARCE"]
1.4359259605407715 ["ARCQJ", "ARCZB", "AARCI", "ARCR", "ARCR", "ARCQS", "ARCGM", "ARCKL", "UARCN", "FARCS", "ARCNA", "ARCZM", "PARCL", "ARCTA", "ARCS", "ARCE", "ARCG", "ARCE"]
The result may change as this code uses random words, but the consumed times may not show big difference for each run.

How to use Swift NSRegularExpression to get uppercased letter?

I have a string like this:
"te_st" and like to replace all underscores followed by a character with the uppercased version of this character.
From "te_st" --> Found (regex: "_.") --------replace with next char (+ uppercase ("s"->"S")--------> "teSt"
From "te_st" ---> to "teSt"
From "_he_l_lo" ---> to "HeLLo"
From "an_o_t_h_er_strin_g" ---> to "anOTHErStrinG"
... but I can not really get it working using Swift's NSRegularExpression like this small snipped does:
var result = "te_st" // result should be teSt
result = try! NSRegularExpression(pattern: "_*").stringByReplacingMatches(in: result, range: NSRange(0..<result.count), withTemplate: ("$1".uppercased()))
There's no regular syntax to convert a match to uppercase. The code you posted is attempting to convert the string $1 to uppercase which is of course just $1. It isn't attempting to convert the value represented by the $1 match at runtime.
Here's another approach using a regular expression to find the _ followed by a lowercase letter. Those are enumerated and replaced with the uppercase letter.
extension String {
func toCamelCase() -> String {
let expr = try! NSRegularExpression(pattern: "_([a-z])")
var res = self
for match in expr.matches(in: self, range: NSRange(0..<res.count)).reversed() {
let range = Range(match.range, in: self)!
let letterRange = Range(match.range(at: 1), in: self)!
res.replaceSubrange(range, with: self[letterRange].uppercased())
}
return res
}
}
print("te_st".toCamelCase())
print("_he_l_lo".toCamelCase())
print("an_o_t_h_er_strin_g".toCamelCase())
This outputs:
teSt
HeLLo
anOTHErStrinG
Here is one implementation using NSRegularExpression. I use group match to get the character after _ and capitalize it and replace the string.
func capitalizeLetterAfterUnderscore(string: String) -> String {
var capitalizedString = string
guard let regularExpression = try? NSRegularExpression(pattern: "_(.)") else {
return capitalizedString
}
let matches = regularExpression.matches(in: string,
options: .reportCompletion,
range: NSMakeRange(0, string.count))
for match in matches {
let groupRange = match.range(at: 1)
let index = groupRange.location
let characterIndex = string.index(string.startIndex,
offsetBy: index)
let range = characterIndex ... characterIndex
let capitalizedCharacter = String(capitalizedString[characterIndex]).capitalized
capitalizedString = capitalizedString.replacingCharacters(in: range,
with: capitalizedCharacter)
}
capitalizedString = capitalizedString.replacingOccurrences(of: "_", with: "")
return capitalizedString
}
capitalizeLetterAfterUnderscore(string: "an_o_t_h_er_strin_g") // anOTHErStrinG
And here is other one without using regular expression. I made extension for method which could also be reused.
extension String {
func indexes(of character: String) -> [Index] {
precondition(character.count == 1, "character should be single letter string")
return enumerated().reduce([]) { (partial, component) in
let currentIndex = index(startIndex,
offsetBy: component.offset)
return String(self[currentIndex]) == character
? partial + [currentIndex]
: partial
}
}
func capitalizeLetter(after indexes: [Index]) -> String {
var modifiedString = self
for currentIndex in indexes {
guard let letterIndex = index(currentIndex,
offsetBy: 1,
limitedBy: endIndex)
else { continue }
let range = letterIndex ... letterIndex
modifiedString = modifiedString.replacingCharacters(in: range,
with: self[range].capitalized)
}
return modifiedString
}
}
let string = "an_o_t_h_er_strin_g"
let newString = string.capitalizeLetter(after: string.indexes(of: "_"))
.replacingOccurrences(of: "_",with: "")
You can use string range(of:, options:, range:) method with .regularExpression options to match the occurrences of "_[a-z]" and replace the subranges iterating the ranges found at reversed order by the character at the index after the range lowerbound uppercased:
let string = "an_o_t_h_er_strin_g"
let regex = "_[a-z]"
var start = string.startIndex
var ranges:[Range<String.Index>] = []
while let range = string.range(of: regex, options: .regularExpression, range: start..<string.endIndex) {
start = range.upperBound
ranges.append(range)
}
var finalString = string
for range in ranges.reversed() {
finalString.replaceSubrange(range, with: String(string[string.index(after: range.lowerBound)]).uppercased())
}
print(finalString) // "anOTHErStrinG\n"
The problem is that it is converting the string "$1" to upper case (which is, unsurprisingly unchanged, just "$1") and using "$1" as the template. If you want to use regex, you will have to enumerate through matches yourself.
The alternative is to split the string by _ characters and uppercase the first character of every substring (except the first) and joining it back together using reduce:
let input = "te_st"
let output = input.components(separatedBy: "_").enumerated().reduce("") { $0 + ($1.0 == 0 ? $1.1 : $1.1.uppercasedFirst()) }
Or, if your goal isn't to write code as cryptic as most regex, we can make that a tad more legible:
let output = input
.components(separatedBy: "_")
.enumerated()
.reduce("") { result, current in
if current.offset == 0 {
return current.element // because you don’t want the first component capitalized
} else {
return result + current.element.uppercasedFirst()
}
}
Resulting in:
teSt
Note, that uses this extension for capitalizing the first character:
extension String {
func uppercasedFirst(with locale: Locale? = nil) -> String {
guard count > 0 else { return self }
return String(self[startIndex]).uppercased(with: locale) + self[index(after: startIndex)...]
}
}
If you want to do sort of dynamic conversion with NSRegularExpression, you can subclass NSRegularExpression and override replacementString(for:in:offset:template:):
class ToCamelRegularExpression: NSRegularExpression {
override func replacementString(for result: NSTextCheckingResult, in string: String, offset: Int, template templ: String) -> String {
if let range = Range(result.range(at: 1), in: string) {
return string[range].uppercased()
} else {
return super.replacementString(for: result, in: string, offset: 0, template: templ)
}
}
}
func toCamelCase(_ input: String) -> String { //Make this a String extension if you prefer...
let regex = try! ToCamelRegularExpression(pattern: "_(.)")
return regex.stringByReplacingMatches(in: input, options: [], range: NSRange(0..<input.utf16.count), withTemplate: "$1")
}
print(toCamelCase("te_st")) //-> teSt
print(toCamelCase("_he_l_lo")) //-> HeLLo
print(toCamelCase("an_o_t_h_er_strin_g")) //-> anOTHErStrinG

Trying to parse HTML in Swift 4 using only the Standard Library

I'm trying to parse some HTML to pull all links that come after any occurrences of the string:
market_listing_row_link" href="
to gather a list of item URL's using only the Swift 4 Standard Library.
What I think I need is a for loop that keeps on checking characters with a condition that once the full string is found, it then starts reading the following item URL into an array until a double quote is reached, stopping and then repeating this process until the end of file. Slightly familiar in C we had access to a function (I think it was fgetc) that did this while advancing a position indicator for the file. Is there any similar way to do this in Swift?
My code so far can only find the first occurrence of the string I'm looking for when there are 10 I need to find.
import Foundation
extension String {
func slice(from: String, to: String) -> String? {
return (range(of: from)?.upperBound).flatMap { substringFrom in
(range(of: to, range: substringFrom..<endIndex)?.lowerBound).map { substringTo in
String(self[substringFrom..<substringTo])
}
}
}
}
let itemListURL = URL(string: "http://steamcommunity.com/market/search?appid=252490")!
let itemListHTML = try String(contentsOf: itemListURL, encoding: .utf8)
let itemURL = URL(string: itemListHTML.slice(from: "market_listing_row_link\" href=\"", to: "\"")!)!
print(itemURL)
// Prints the current first URL found matching: http://steamcommunity.com/market/listings/252490/Wyrm%20Chest
You can use regex to find all string occurrences between two specific strings (check this SO answer) and use the extension method ranges(of:) from this answer to get all ranges of that regex pattern. You just need to pass options .regularExpression to that method.
extension String {
func ranges(of string: String, options: CompareOptions = .literal) -> [Range<Index>] {
var result: [Range<Index>] = []
var start = startIndex
while let range = range(of: string, options: options, range: start..<endIndex) {
result.append(range)
start = range.lowerBound < range.upperBound ? range.upperBound : index(range.lowerBound, offsetBy: 1, limitedBy: endIndex) ?? endIndex
}
return result
}
func slices(from: String, to: String) -> [Substring] {
let pattern = "(?<=" + from + ").*?(?=" + to + ")"
return ranges(of: pattern, options: .regularExpression)
.map{ self[$0] }
}
}
Testing playground
let itemListURL = URL(string: "http://steamcommunity.com/market/search?appid=252490")!
let itemListHTML = try! String(contentsOf: itemListURL, encoding: .utf8)
let result = itemListHTML.slices(from: "market_listing_row_link\" href=\"", to: "\"")
result.forEach({print($0)})
Result
http://steamcommunity.com/market/listings/252490/Night%20Howler%20AK47
http://steamcommunity.com/market/listings/252490/Hellcat%20SAR
http://steamcommunity.com/market/listings/252490/Metal
http://steamcommunity.com/market/listings/252490/Volcanic%20Stone%20Hatchet
http://steamcommunity.com/market/listings/252490/Box
http://steamcommunity.com/market/listings/252490/High%20Quality%20Bag
http://steamcommunity.com/market/listings/252490/Utilizer%20Pants
http://steamcommunity.com/market/listings/252490/Lizard%20Skull
http://steamcommunity.com/market/listings/252490/Frost%20Wolf
http://steamcommunity.com/market/listings/252490/Cloth