Transliteration and fuzzy search, like Google suggestions - search

I need to do a fuzzy search with transliteration of the characters, for example:
I have an ASP.NET application with a database that contains a table of Spanish words (200,000 entries), and a page with an input field. The point is that I do not know Spanish and do not know how to spell a search word in Spanish, but I know how it sounds. So I type the word the way I hear it: for example, looking for "beautiful", I mistakenly enter "prekieso", and I need the database to return the correct spelling: "precioso".
How can this be implemented? In other words, I need something similar to Google suggestions...

I think what you need here is a spell checking functionality like this one: http://www.codeproject.com/KB/string/netspell.aspx
Google-like functionality is much more advanced, though, and will not be easy to implement:
How does the Google "Did you mean?" Algorithm work?
hope this help.

The following user-defined function calculates the Levenshtein distance between two strings:
USE [**dbname**]
GO
/****** Object: UserDefinedFunction [dbo].[levenshtein] Script Date: 05/27/2013 17:54:05 ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
-- Returns the Levenshtein (edit) distance between @left and @right: the
-- minimum number of single-character insertions, deletions and substitutions
-- needed to turn one string into the other.
--
-- Parameters:
--   @left, @right  strings to compare (at most 100 characters each).
-- Returns:
--   the distance as an int, or NULL when either argument is NULL.
--
-- Notes:
--   * Lengths come from LEN(), so trailing spaces are ignored.
--   * Character equality follows the collation in effect, so case sensitivity
--     depends on the database/column collation.
--   * The previous greedy "re-align" scan over-counted some inputs
--     (e.g. 'ac' vs 'abbc' gave 3 instead of 2); this version is the standard
--     dynamic-programming algorithm, keeping only two rows of the matrix.
ALTER FUNCTION [dbo].[levenshtein](@left varchar(100), @right varchar(100))
RETURNS int
AS
BEGIN
    DECLARE @lenLeft int, @lenRight int,
            @i int, @j int,
            @cost int, @best int, @candidate int,
            @leftChar char(1),
            -- Each row stores one distance per byte; distances never exceed
            -- 100, so a single byte per cell is enough (position j + 1 holds
            -- the value for the first j characters of @right).
            @prevRow varbinary(101), @currRow varbinary(101);

    IF @left IS NULL OR @right IS NULL
        RETURN NULL;

    SET @lenLeft = LEN(@left);
    SET @lenRight = LEN(@right);

    -- Distance to or from an empty string is the other string's length.
    IF @lenLeft = 0
        RETURN @lenRight;
    IF @lenRight = 0
        RETURN @lenLeft;

    -- Row 0: turning the empty prefix of @left into the first j characters
    -- of @right takes j insertions.
    SET @prevRow = 0x;
    SET @j = 0;
    WHILE @j <= @lenRight
    BEGIN
        SET @prevRow = @prevRow + CAST(@j AS binary(1));
        SET @j = @j + 1;
    END

    SET @i = 1;
    WHILE @i <= @lenLeft
    BEGIN
        SET @leftChar = SUBSTRING(@left, @i, 1);
        -- Column 0: removing the first @i characters of @left.
        SET @currRow = CAST(@i AS binary(1));
        SET @j = 1;
        WHILE @j <= @lenRight
        BEGIN
            SET @cost = CASE WHEN @leftChar = SUBSTRING(@right, @j, 1) THEN 0 ELSE 1 END;

            -- Substitution (or match): diagonal cell.
            SET @best = CAST(SUBSTRING(@prevRow, @j, 1) AS int) + @cost;

            -- Deletion from @left: cell directly above.
            SET @candidate = CAST(SUBSTRING(@prevRow, @j + 1, 1) AS int) + 1;
            IF @candidate < @best
                SET @best = @candidate;

            -- Insertion into @left: cell to the left in the current row.
            SET @candidate = CAST(SUBSTRING(@currRow, @j, 1) AS int) + 1;
            IF @candidate < @best
                SET @best = @candidate;

            SET @currRow = @currRow + CAST(@best AS binary(1));
            SET @j = @j + 1;
        END
        SET @prevRow = @currRow;
        SET @i = @i + 1;
    END

    -- The last cell of the last row is the distance between the full strings.
    RETURN CAST(SUBSTRING(@prevRow, @lenRight + 1, 1) AS int);
END
invoking:
-- Sample calls. The function created above is dbo.levenshtein, so it is
-- invoked under that name (scalar UDFs need the schema prefix).
-- Whether the case-only pair counts as different depends on the collation.
SELECT
    dbo.levenshtein('Fuzzy String Match', 'fuzzy string match') AS case_only_difference,
    dbo.levenshtein('fuzzy', 'fuzy') AS one_missing_letter,
    dbo.levenshtein('Fuzzy String Match', 'fuzy string match') AS case_and_missing_letter,
    dbo.levenshtein('levenshtein distance sql', 'levenshtein sql server') AS reordered_words,
    dbo.levenshtein('distance', 'server') AS unrelated_words;
or:
-- Names within three edits of the search term.
-- The function is evaluated for every row, so this is a full scan of the table.
SELECT [Name]
FROM [tempdb].[dbo].[Names]
WHERE dbo.levenshtein([Name], 'bozhestvennia') <= 3;

Related

Text values in formulas are limited to 255 characters to create longer text values in formula

I want to insert some big values in an excel sheet, but it is giving errors.
Error
Text values in formulas are limited to 255 characters. To create text
values longer in a formula, use the CONCATENATE
function or the concatenation operator (&).
Value
=CONCATENATE("BEGIN TRY
BEGIN TRANSACTION
BEGIN --User defined values
DECLARE #FieldName NVARCHAR(100) = '",F2,"';
DECLARE #Currencycode VARCHAR(3) = '",A2,"';
DECLARE #Countrycode VARCHAR(2) = '",B2,"';
DECLARE #RuleType NVARCHAR(100) = 'CannotContainSpecialCharOtherThan';
DECLARE #RuleValue INT = 0;
DECLARE #InsFieldTypeDesc NVARCHAR(50) = 'AnyType';
DECLARE #RuleValueAlpha NVARCHAR(4000) = '/: (),.''-?+';
DECLARE #ErrMsg NVARCHAR(500) = '",N2,"';
DECLARE #ErrCode NVARCHAR(10) = '",M2,"';
DECLARE #ErrPrior TINYINT = '",L2,"';
DECLARE #IsLength INT = 0; ---if length is there then set this 1 or 0
DECLARE #DependantOn NVARCHAR(100) = NULL
END
BEGIN --Consts
DECLARE #UTCTime DATETIME = Getutcdate();
DECLARE #CTTime DATETIME = Dateadd(hour, -5, Getutcdate());
END
DECLARE #FieldRuleConfigPk INT;
DECLARE #ErrorMsgpk INT;
DECLARE #countryfk INT;
DECLARE #Fieldnamefk INT;
DECLARE #InsFieldTypeFk INT;
DECLARE #RuleValue1 INT = 0;
DECLARE #RuleValue2 INT = 19;
DECLARE #RuleValue3 INT = 19;
DECLARE #RuleTypeFk INT;
DECLARE #ErrorFk INT;
DECLARE #IsOk INT = 0
PRINT( 'Starts script' )
BEGIN --INIT
SET #countryfk = (SELECT countrypk
FROM mas_country
WHERE countrycode = #Countrycode);
SET #Fieldnamefk = (SELECT fieldnamepk
FROM mas_fieldname
WHERE fieldname = #FieldName);
SET #InsFieldTypeFk = (SELECT instructionfieldtypepk
FROM mas_instructionfieldtype
WHERE fieldtypedesc = #InsFieldTypeDesc);
END
IF NOT EXISTS (SELECT 1
FROM mas_fieldruleconfig
WHERE currencycode = #Currencycode
AND countryfk = #countryfk
AND fieldnamefk = #Fieldnamefk)
BEGIN
INSERT INTO mas_fieldruleconfig
(currencycode,
countryfk,
fieldnamefk,
instructionfieldtypefk,
createddateutc,
createddatect)
VALUES ( #Currencycode,
#countryfk,
#Fieldnamefk,
#InsFieldTypeFk,
#UTCTime,
#CTTime )
SELECT #FieldRuleConfigPk = Scope_identity();
SET #IsOk = 1
PRINT( 'mas_fieldruleconfig insert success' )
END
ELSE
BEGIN
SELECT #FieldRuleConfigPk = fieldruleconfigpk
FROM mas_fieldruleconfig
WHERE currencycode = #Currencycode
AND countryfk = #countryfk
AND fieldnamefk = #Fieldnamefk
PRINT( 'mas_fieldruleconfig setting already exists' )
END
IF NOT EXISTS (SELECT 1
FROM mas_ruletype
WHERE ruletype = #RuleType
AND fieldruleconfigfk = #FieldRuleConfigPk
--Need to add this check, otherwise it will fail
AND rulevalue = #RuleValue --Not needed
AND rulevaluealphanumeric = #RuleValueAlpha)
--Not needed
BEGIN
INSERT INTO mas_ruletype
(ruletype,
fieldruleconfigfk,
rulevalue,
rulevaluealphanumeric,
createddateutc,
createddatect,
dependanton)
VALUES ( #RuleType,
#FieldRuleConfigPk,
#RuleValue,
#RuleValueAlpha,
#UTCTime,
#CTTime,
#DependantOn)
SELECT #RuleTypeFk = Scope_identity();
SET #IsOk = 1
PRINT( 'mas_ruletype insert success' )
END
ELSE
BEGIN
SELECT #RuleTypeFk = (SELECT ruletypepk
FROM mas_ruletype
WHERE ruletype = #RuleType
AND fieldruleconfigfk =
#FieldRuleConfigPk
--Need to add this check, otherwise it will fail
AND rulevalue = #RuleValue
--Not needed
AND rulevaluealphanumeric =
#RuleValueAlpha)
PRINT( 'mas_ruletype settings already exists' )
END
IF NOT EXISTS (SELECT 1
FROM mas_errormessage
WHERE errormessage = #ErrMsg)
BEGIN
INSERT INTO mas_errormessage
(errormessage,
createddateutc,
createddatect)
VALUES ( #ErrMsg,
#UTCTime,
#CTTime )
SELECT #ErrorMsgpk = Scope_identity()
SET #IsOk = 1
PRINT( 'mas_errmsg insert success' )
END
ELSE
BEGIN
SELECT #ErrorMsgpk = errormessagepk
FROM mas_errormessage
WHERE errormessage = #ErrMsg
PRINT( 'mas_errormsg settings already exists' )
END
IF NOT EXISTS (SELECT 1
FROM mas_error
WHERE fieldnamefk = #Fieldnamefk
AND errorcode = #ErrCode)
BEGIN
INSERT INTO mas_error
(fieldnamefk,
errorcode,
errorpriority,
errormessagefk,
createddateutc,
createddatect)
VALUES ( #Fieldnamefk,
#ErrCode,
#ErrPrior,
#ErrorMsgpk,
#UTCTime,
#CTTime )
SELECT #ErrorFk = Scope_identity();
SET #IsOk = 1
PRINT( 'mas_error insert success' )
END
ELSE
BEGIN
SELECT #ErrorFk = (SELECT errorpk
FROM mas_error
WHERE fieldnamefk = #Fieldnamefk
AND errorcode = #ErrCode);
PRINT( 'Mas_Error settings already exists' )
END
-- Link the rule to its error unless that exact link already exists.
-- The last predicate must test the errorfk column; comparing the variable
-- with itself never filtered on the error at all.
IF NOT EXISTS (SELECT 1
FROM [lnk_fieldruleerror]
WHERE [fieldruleconfigfk] = #FieldRuleConfigPk
AND [fieldnamefk] = #Fieldnamefk
AND [ruletypefk] = #RuleTypeFk
AND [errorfk] = #ErrorFk)
BEGIN
INSERT INTO [dbo].[lnk_fieldruleerror]
([fieldruleconfigfk],
[fieldnamefk],
[ruletypefk],
[errorfk])
VALUES (#FieldRuleConfigPk,
#Fieldnamefk,
#RuleTypeFk,
#ErrorFk)
END
-- Reaching this point means no statement failed (errors jump to the CATCH
-- block), so the transaction is always committed. Committing only when
-- #IsOk = 1 left the transaction open whenever nothing new was inserted, or
-- when only the lnk_fieldruleerror row was added.
COMMIT TRANSACTION
PRINT( 'commit' )
END TRY
BEGIN CATCH
PRINT 'Error'
PRINT ( 'Rollback' )
ROLLBACK TRANSACTION;
END CATCH")
I have followed this link also but could not find a solution.
Try breaking it up like this:
=CONCATENATE("BEGIN TRY",CHAR(10),
"BEGIN TRANSACTION",CHAR(10),
"dgd",CHAR(10),
"dfh",CHAR(10),
etc.
);
I have solved by splitting the value into multiple cells and merging these into a new cell like this =A2&" "&B2

Fastest way to conditionally strip off the right part of a string

I need to remove the numeric part at the end of a string. Here are some examples:
"abcd1234" -> "abcd"
"a3bc45" -> "a3bc"
"kj3ih5" -> "kj3ih"
You get the idea.
I implemented a function which works well for this purpose.
''' <summary>
''' Returns <paramref name="name"/> without the run of numeric characters at
''' its end, e.g. "abcd1234" -> "abcd" and "a3bc45" -> "a3bc".
''' </summary>
''' <param name="name">Text to trim; Nothing is treated like an empty string.</param>
''' <returns>
''' The leading part of the text up to and including its last non-numeric
''' character; an empty string when the text is empty or entirely numeric.
''' </returns>
Function VarStamm(name As String) As String
    Dim i As Integer
    ' Walk backwards until the first non-numeric character is found.
    For i = Len(name) To 1 Step -1
        If IsNumeric(Mid(name, i, 1)) = False Then
            Exit For
        End If
    Next i
    ' i is now the 1-based position of the last non-numeric character, or 0
    ' when there is none. Taking the first i characters covers every case,
    ' including all-digit and empty input, which used to end up in
    ' Substring(0, -1) and throw.
    Return Left(name, i)
End Function
The question is: is there any faster (more efficient in speed) way to do this? The problem is, I call this function within a loop with 3 million iterations and it would be nice to have it be more efficient.
I know about the String.LastIndexOf method, but I don't know how to use it when I need the index of the last connected number within a string.
You can use Array.FindLastIndex and then Substring:
' Split text right after its last non-digit character: part1 is everything up
' to and including that character, part2 is the trailing run of digits.
Dim characters As Char() = text.ToCharArray()
Dim digitsStart As Integer = Array.FindLastIndex(characters, Function(ch) Not Char.IsDigit(ch)) + 1
' digitsStart is 0 when the text has no non-digit character; nothing is split then.
If digitsStart > 0 Then
    Dim part1 As String = text.Substring(0, digitsStart)
    Dim part2 As String = text.Substring(digitsStart)
End If
I was skeptical that the Array.FindLastIndex method was actually faster, so I tested it myself. I borrowed the testing code posted by Amessihel, but added a third method:
''' <summary>
''' Returns <paramref name="name"/> with its trailing run of digit characters
''' removed, scanning from the end with Char.IsDigit and the string indexer.
''' </summary>
''' <param name="name">Text to trim; must not be Nothing.</param>
''' <returns>The text up to and including its last non-digit character ("" when every character is a digit).</returns>
Function VarStamm3(name As String) As String
    ' keep = number of leading characters to retain.
    Dim keep As Integer = name.Length
    Do While keep > 0 AndAlso Char.IsDigit(name(keep - 1))
        keep -= 1
    Loop
    Return name.Substring(0, keep)
End Function
It uses your original algorithm, but just swaps out the old VB6-style string methods for newer .NET equivalent ones. Here's the results on my machine:
RunTime :
- VarStamm : 00:00:07.92
- VarStamm2 : 00:00:00.60
- VarStamm3 : 00:00:00.23
As you can see, your original algorithm was already quite well tuned. The problem wasn't the loop. The problem was Mid, IsNumeric, and Len. Since Tim's method didn't use those, it was much faster. But, if you stick with a manual for loop, it's twice as fast as using Array.FindLastIndex, all things being equal
Given your function VarStamm and Tim Schmelter's one named VarStamm2, here is a small test performance I wrote. I typed an arbitrary long String with a huge right part, and ran the functions one million times.
''' <summary>
''' Small benchmark comparing the original VB6-style VarStamm with the
''' Array.FindLastIndex based VarStamm2 on one long test string.
''' </summary>
Module StackOverlow
    ''' <summary>Runs both implementations and prints the elapsed time of each.</summary>
    Sub Main()
        Dim testStr = "azekzoerjezoriezltjreoitueriou7657678678797897898997897978897898797989797"
        Console.WriteLine("RunTime :" + vbNewLine +
        " - VarStamm : " + getTimeSpent(AddressOf VarStamm, testStr) + vbNewLine +
        " - VarStamm2 : " + getTimeSpent(AddressOf VarStamm2, testStr))
    End Sub

    ''' <summary>
    ''' Calls <paramref name="f"/> one million times with <paramref name="str"/>
    ''' and returns the elapsed time formatted as hh:mm:ss.ff.
    ''' </summary>
    ''' <param name="f">Routine under test; any return value is discarded.</param>
    ''' <param name="str">Argument passed on every call.</param>
    Function getTimeSpent(f As Action(Of String), str As String) As String
        Dim sw As Stopwatch = Stopwatch.StartNew()
        For i = 1 To 1000000
            f(str)
        Next
        sw.Stop()
        ' A TimeSpan format string truncates to hundredths. Formatting
        ' Milliseconds / 10 with "00" rounded instead, so e.g. 997 ms was
        ' printed as ".100".
        Return sw.Elapsed.ToString("hh\:mm\:ss\.ff")
    End Function

    ''' <summary>
    ''' Baseline implementation (Len / Mid / IsNumeric): returns name without
    ''' its trailing numeric characters.
    ''' </summary>
    Function VarStamm(name As String) As String
        Dim i As Integer
        For i = Len(name) To 1 Step -1
            If IsNumeric(Mid(name, i, 1)) = False Then
                Exit For
            End If
        Next i
        ' i is the 1-based position of the last non-numeric character (0 when
        ' there is none); all-digit or empty input therefore yields "" instead
        ' of reaching Substring(0, -1).
        Return Left(name, i)
    End Function

    ''' <summary>
    ''' Array.FindLastIndex implementation: returns name up to and including
    ''' its last non-digit character, or name unchanged when it has none.
    ''' </summary>
    Function VarStamm2(name As String) As String
        Dim lastNonDigitIndex = Array.FindLastIndex(name.ToCharArray(), Function(c) Not Char.IsDigit(c))
        If lastNonDigitIndex >= 0 Then
            lastNonDigitIndex += 1
            Return name.Substring(0, lastNonDigitIndex)
        End If
        Return name
    End Function
End Module
Here is the output I got:
RunTime :
- VarStamm : 00:00:38.33
- VarStamm2 : 00:00:02.72
So yes, you should choose his answer, his code is both pretty and efficient.

How are nested if-else loops resolved in BASIC

I've got this legacy code I'm analyzing:
If (X) then
if Cnt < 4 then Cnt = Cnt + 1 ; count up while below 4 (4 samples)
Else
if Cnt > 0 then Cnt = Cnt-1 ; count down while above 0 (keep history)
EndIf
Which has Cnt go up and down depending on X
And I'm wondering if that else statement acts like their indention implies they think it does.
The code might be interpreted more like:
If (X) then
if Cnt < 4 then
Cnt = Cnt + 1 ; 4 samples
Else
if Cnt > 0 then
Cnt = Cnt-1 ; keep history
EndIf
In which Cnt get to 4 and then toggles on/off if X is true.
This is basic as compiled using BCI51. That's a basic compiler for an 8051 from back in 1990 by Systronix.
How do nested if-else pairs get resolved in basic?
I remember how QBasic did this, and I'm going to assume that this compiler does the same. This is really tugging on my memory, so I might be wrong.
If a IF THEN is followed by code on the same line, then it is fully contained. Therefore
if Cnt < 4 then Cnt = Cnt + 1
else
...
would be illegal: you must place the Cnt = Cnt + 1 on its own line to create a multi-line IF statement. Therefore, the ELSE is paired with the topmost IF.
Since, in the original code, the Cnt = Cnt + 1 and Cnt = Cnt - 1 are on the same lines as the IF THEN, I would interpret the code as follows:
If (X) then
If Cnt < 4 Then
Cnt = Cnt + 1 ; count up while below 4 (4 samples)
EndIf
Else
If Cnt > 0 Then
Cnt = Cnt-1 ; count down while above 0 (keep history)
EndIf
EndIf
So, yes, I believe the code operates as the indentation implies.
Are you able to modify the code and test if you see any changes?

Compatible SQL function for Excel FDist

Does anyone know whether SQL has functions compatible with Excel's FDIST and FINV? If not, does anyone have an idea how to build them, maybe in C#?
Thanks. Your help is greatly appreciated.
I have managed to resolve my problems by using a library from .Net Framework 4.0 and above (System.Windows.Forms.DataVisualization.Charting.StatisticFormula).
I am able to develop a function in C# using the above library for my calculation process. This is a powerful library where you can find mostly common use statistical formula in there (e.g. mean, median, t distribution, f distribution, and inverse of them.)
Below are the code snippet from me:
// Namespace that provides the Chart type used below.
using System.Windows.Forms.DataVisualization.Charting;
// The statistics formulas are reached through a Chart instance (ch.DataManipulator.Statistics).
private Chart ch = new Chart(); // You will need to declare an object of Chart type, as Statistic Formula class does not have a public constructor
// fRatioVariance, degreeFreedom1 and degreeFreedom2 are supplied by the surrounding code (not shown here).
double fDist = ch.DataManipulator.Statistics.FDistribution(fRatioVariance, degreeFreedom1, degreeFreedom2);
Hope this will help others. Thanks.
Though it's late, below are some statistical functions implemented in SQL Server itself.
To get an FDist function (equivalent to Excel's FDIST), we need the complete and incomplete beta functions as well as the gamma function:
--GAMMA Function
-- Approximates the gamma function for @x > 0.
--   * Arguments below 3 are shifted upwards with Gamma(x) = Gamma(x + 1) / x;
--     @g collects the product of the factors divided out.
--   * The shifted argument is evaluated with a truncated Stirling series for
--     ln Gamma(x) and the result is divided by @g.
-- Returns NULL for a NULL argument and the sentinel 10E99 for @x <= 0.
CREATE FUNCTION [dbo].[udf_Gamma]
(
    @x Float = NULL
)
RETURNS Float
AS
BEGIN
    -- Sentinel returned when @x is not positive.
    DECLARE @f Float = 10E99;
    DECLARE @g Float = 1;

    IF (@x IS NULL)
        RETURN NULL;

    IF (@x > 0)
    BEGIN
        WHILE (@x < 3)
        BEGIN
            SET @g = @g * @x;
            SET @x = @x + 1;
        END
        -- Correction terms: 1/(12x) - 1/(360x^3) + 1/(1260x^5) - ...
        SET @f = (1 - (2 / (7 * POWER(@x, 2))) * (1 - 2 / (3 * POWER(@x, 2)))) / (30 * POWER(@x, 2));
        SET @f = (1 - @f) / (12 * @x) + @x * (LOG(@x) - 1);
        -- exp(series) * sqrt(2*pi/x), undoing the upward shift via @g.
        SET @f = (EXP(@f) / @g) * POWER(2 * PI() / @x, 0.5);
    END

    RETURN @f;
END
GO
--BETA Complete Function
-- Evaluates the continued fraction used for the incomplete beta function
-- (the classic "betacf" routine). Note the parameter order here is
-- (@x, @a, @b), whereas betacf takes (a, b, x).
-- Iteration stops after @maxIterations - 1 steps or as soon as two successive
-- estimates differ by less than @eps relative to the current estimate.
CREATE FUNCTION [dbo].[udf_BetaC]
(
    @x Float = NULL
    ,@a Float = NULL
    ,@b Float = NULL
)
RETURNS Float
AS
BEGIN
    --double betacf(double a,double b,double x){
    DECLARE @maxIterations int = 50, @m int = 1;
    DECLARE @eps Float = 3E-5;
    DECLARE @am Float = 1;
    DECLARE @bm Float = 1;
    DECLARE @az Float = 1;
    DECLARE @qab Float = @a + @b;
    DECLARE @qap Float = @a + 1;
    DECLARE @qam Float = @a - 1;
    DECLARE @bz Float = 1 - @qab * @x / @qap;
    DECLARE @aold Float = 0;
    DECLARE @em Float, @tem Float, @d Float, @ap Float, @bp Float, @app Float, @bpp Float;

    WHILE ((@m < @maxIterations) AND (ABS(@az - @aold) >= @eps * ABS(@az)))
    BEGIN
        SET @em = @m;
        SET @tem = @em + @em;
        -- Even step of the recurrence.
        SET @d = @em * (@b - @m) * @x / ((@qam + @tem) * (@a + @tem));
        SET @ap = @az + @d * @am;
        SET @bp = @bz + @d * @bm;
        -- Odd step of the recurrence.
        SET @d = -(@a + @em) * (@qab + @em) * @x / ((@a + @tem) * (@qap + @tem));
        SET @app = @ap + @d * @az;
        SET @bpp = @bp + @d * @bz;
        -- Remember the previous estimate, then renormalise by @bpp.
        SET @aold = @az;
        SET @am = @ap / @bpp;
        SET @bm = @bp / @bpp;
        SET @az = @app / @bpp;
        SET @bz = 1;
        SET @m = @m + 1;
    END

    RETURN @az;
END
GO
--BETA INCOMPLETE Function
-- Regularised incomplete beta function I_x(a, b), built from dbo.udf_Gamma
-- (prefactor) and dbo.udf_BetaC (continued fraction).
-- The function names are spelled exactly as they are created so the calls
-- also resolve under a case-sensitive collation.
CREATE FUNCTION [dbo].[udf_BetaI]
(
    @x Float = NULL
    ,@a Float = NULL
    ,@b Float = NULL
)
RETURNS Float
AS
BEGIN
    DECLARE @bt Float = 0.0;
    DECLARE @beta Float = 0.0;

    -- Prefactor Gamma(a+b) / (Gamma(a) * Gamma(b)) * x^a * (1-x)^b.
    -- It stays 0 at the end points x = 0 and x = 1 (and for x outside [0, 1]).
    IF ((@x > 0) AND (@x < 1))
    BEGIN
        SET @bt = dbo.udf_Gamma(@a + @b) * POWER(@x, @a) * POWER(1 - @x, @b)
                  / (dbo.udf_Gamma(@a) * dbo.udf_Gamma(@b));
    END

    IF (@x < (@a + 1) / (@a + @b + 2))
    BEGIN
        -- Evaluate the continued fraction directly.
        SET @beta = @bt * dbo.udf_BetaC(@x, @a, @b) / @a;
    END
    ELSE
    BEGIN
        -- Otherwise use the symmetry I_x(a, b) = 1 - I_(1-x)(b, a).
        SET @beta = 1 - @bt * dbo.udf_BetaC(1 - @x, @b, @a) / @b;
    END

    RETURN @beta;
END
GO
--FDist Function
-- Right-tailed F probability for the value @x with @df1 (numerator) and @df2
-- (denominator) degrees of freedom, computed as
--   1 - I_x1(df1 / 2, df2 / 2)   with   x1 = x*df1 / (x*df1 + df2).
-- Returns NULL when an argument is NULL, @x is negative or a degrees-of-freedom
-- argument is not positive; those inputs previously caused a divide-by-zero
-- error or a meaningless result.
CREATE FUNCTION [dbo].[udf_FDist]
(
    @x Float = NULL
    ,@df1 Float = NULL
    ,@df2 Float = NULL
)
RETURNS Float
AS
BEGIN
    DECLARE @x1 Float;

    IF (@x IS NULL OR @df1 IS NULL OR @df2 IS NULL OR @x < 0 OR @df1 <= 0 OR @df2 <= 0)
        RETURN NULL;

    SET @x1 = (@x * @df1) / ((@x * @df1) + @df2);

    RETURN 1 - dbo.udf_BetaI(@x1, (@df1 / 2), (@df2 / 2));
END
GO
Check: in Excel, =FDIST(0.5,1,1) returns 0.608173448,
and in a SQL editor, SELECT dbo.udf_FDist(0.5,1,1) returns 0.608173457369209.
Regards,
Avi
FDIST and FINV don't exist in SQL Server.
You can write SQL Server functions to implement these two Excel features.
See here for how to create a function.
in MS SQL Server there is standard_deviation
and some more statistical functions including variance.
There is also a million workarounds, for example: http://oreilly.com/catalog/transqlcook/chapter/ch08.html
,you need to dig deep into the math for these, though.

SQLite full-text search relevance ranking

I am using the fts4 extension of sqlite3 to enable full-text indexing and searching of text data. This is working great, but I've noticed that the results are not relevance-ranked at all. I guess I am too used to Lucene. I've seen some brief suggestions to write a custom rank method using the matchinfo() results, but it's not clear to me how this is done, or whether there are any sophisticated examples out there. How have others dealt with this?
There's a complete example in the documentation, look at the end of appendix a. You'll need to do slightly more work to get a good relevance ranking as the function provided is good only for getting started. For example, with matchinfo(table,'pcnalx') there's enough information to implement Okapi BM25.
There seems to be a distinct lack of documentation on how to implement Okapi BM25 in C and it seems it is an unspoken thing that the implementation is left as an exercise for the user.
Well I found the bro of a programmer "Radford 'rads' Smith" who chucked this up on GitHub
https://github.com/rads/sqlite-okapi-bm25
It only implements BM25 although I'm looking into BM25F tweaks now....
....and here it is.
https://github.com/neozenith/sqlite-okapi-bm25
For FTS5, according to SQLite FTS5 Extension,
FTS5 has no matchinfo().
FTS5 supports ORDER BY rank
So very simply, something like
-- Full-text query against the FTS5 table "email", ordered by the rank column.
SELECT *
FROM email
WHERE email MATCH 'fts5'
ORDER BY rank;
without DESC works.
Here is an implementation of Okapi BM25. Using this in combination with the suggestions at SQLite.org will help you generate a relevance-ranked MATCH query. This was written all in VB.Net and the query was called using System.Data.SQLite functions. The custom SQLiteFunction at the end can be called from the SQL code without issue, as long as the SQL code is called using System.Data.SQLite functions.
''' <summary>
''' Decodes the blob produced by FTS4 matchinfo() with the format string
''' 'pcnalsx' and derives hit statistics and an Okapi BM25 score from it.
''' </summary>
''' <remarks>
''' Blob layout, as 32-bit integers: p (phrase count), c (column count),
''' n (row count), then c values each for a (average tokens per column),
''' l (tokens in this row per column) and s (longest subsequence match per
''' column), followed by 3 * c * p values for x. For each phrase/column pair
''' x holds: hits in this row, hits in all rows, rows with at least one hit.
''' </remarks>
Public Class MatchInfo
    ''' <summary>Number of matchable phrases in the query (matchinfo 'p').</summary>
    Property matchablePhrases As Integer
    ''' <summary>Number of user-defined columns in the table (matchinfo 'c').</summary>
    Property userDefinedColumns As Integer
    ''' <summary>Number of rows in the table (matchinfo 'n').</summary>
    Property totalDocuments As Integer

    ' Index of each value inside one phrase/column triple of the 'x' data.
    Private Const HitsThisRowSlot As Integer = 0
    Private Const HitsAllRowsSlot As Integer = 1
    Private Const DocsWithHitsSlot As Integer = 2

    Private _int32HitData As List(Of Integer)
    Private _longestSubsequencePhraseMatches As New List(Of Integer)
    Private _tokensInDocument As New List(Of Integer)
    Private _averageTokensInDocument As New List(Of Integer)

    Private _max_hits_this_row As Integer?
    ''' <summary>Largest per-phrase/column hit count in the current row (computed once).</summary>
    Public ReadOnly Property max_hits_this_row As Integer
        Get
            If Not _max_hits_this_row.HasValue Then
                _max_hits_this_row = MaxHitValue(HitsThisRowSlot)
            End If
            Return _max_hits_this_row.Value
        End Get
    End Property

    Private _max_hits_all_rows As Integer?
    ''' <summary>Largest per-phrase/column hit count over all rows (computed once).</summary>
    Public ReadOnly Property max_hits_all_rows As Integer
        Get
            If Not _max_hits_all_rows.HasValue Then
                _max_hits_all_rows = MaxHitValue(HitsAllRowsSlot)
            End If
            Return _max_hits_all_rows.Value
        End Get
    End Property

    Private _max_docs_with_hits As Integer?
    ''' <summary>Largest per-phrase/column count of rows containing a hit (computed once).</summary>
    Public ReadOnly Property max_docs_with_hits As Integer
        Get
            If Not _max_docs_with_hits.HasValue Then
                _max_docs_with_hits = MaxHitValue(DocsWithHitsSlot)
            End If
            Return _max_docs_with_hits.Value
        End Get
    End Property

    Private _BM25Rank As Double?
    ''' <summary>
    ''' Okapi BM25 score of the current row, summed over every phrase/column
    ''' pair (computed once). See http://en.wikipedia.org/wiki/Okapi_BM25
    ''' </summary>
    Public ReadOnly Property BM25Rank As Double
        Get
            If Not _BM25Rank.HasValue Then
                'k1, calibrates the document term frequency scaling. Having k1 as 0 corresponds to a binary model – no term frequency. Increasing k1 will give rare words more boost.
                'b, calibrates the scaling by document length, and can take values from 0 to 1, where having 0 means no length normalization and having 1 corresponds to fully scaling the term weight by the document length.
                Dim k1 As Double = 1.2
                Dim b As Double = 0.75
                Dim total As Double = 0
                For column = 0 To userDefinedColumns - 1
                    ' Length of this row's column relative to the average.
                    ' A column with no tokens anywhere has average 0; treat it
                    ' as average length instead of dividing by zero.
                    Dim averageLength As Double = _averageTokensInDocument(column)
                    Dim lengthRatio As Double = 1.0
                    If averageLength > 0 Then
                        lengthRatio = _tokensInDocument(column) / averageLength
                    End If
                    For phrase = 0 To matchablePhrases - 1
                        ' IDF is based on the number of rows containing the
                        ' phrase, not on the total number of occurrences: the
                        ' latter can exceed totalDocuments and make the
                        ' logarithm's argument negative.
                        Dim docsWithPhrase As Double = docs_with_hits(phrase, column)
                        Dim termFrequency As Double = hits_this_row(phrase, column)
                        Dim IDF As Double = Math.Log((totalDocuments - docsWithPhrase + 0.5) / (docsWithPhrase + 0.5))
                        Dim score As Double = IDF * (termFrequency * (k1 + 1)) / (termFrequency + k1 * (1 - b + b * lengthRatio))
                        ' Negative contributions (very common phrases) are
                        ' clamped to zero; this test also skips NaN.
                        If score > 0 Then
                            total += score
                        End If
                    Next
                Next
                _BM25Rank = total
            End If
            Return _BM25Rank.Value
        End Get
    End Property

    ''' <summary>Parses a matchinfo(table, 'pcnalsx') blob.</summary>
    ''' <param name="raw_pcnalsx_MatchInfo">Raw blob returned by matchinfo().</param>
    ''' <exception cref="ArgumentNullException">The blob is Nothing.</exception>
    Public Sub New(raw_pcnalsx_MatchInfo As Byte())
        If raw_pcnalsx_MatchInfo Is Nothing Then
            Throw New ArgumentNullException("raw_pcnalsx_MatchInfo")
        End If

        ' The blob is a sequence of 32-bit integers in machine byte order.
        ' Only whole 4-byte words are read; the values are read as signed so
        ' they fit the Integer list without an overflowing conversion.
        Dim values As New List(Of Integer)(raw_pcnalsx_MatchInfo.Length \ 4)
        For offset = 0 To raw_pcnalsx_MatchInfo.Length - 4 Step 4
            values.Add(BitConverter.ToInt32(raw_pcnalsx_MatchInfo, offset))
        Next

        'take the raw data and parse it out
        Me.matchablePhrases = values(0)
        Me.userDefinedColumns = values(1)
        Me.totalDocuments = values(2)

        'remember that the columns are 0-based
        Dim cursor As Integer = 3
        For i = 0 To userDefinedColumns - 1
            _averageTokensInDocument.Add(values(cursor + i))
        Next
        cursor += userDefinedColumns
        For i = 0 To userDefinedColumns - 1
            _tokensInDocument.Add(values(cursor + i))
        Next
        cursor += userDefinedColumns
        For i = 0 To userDefinedColumns - 1
            _longestSubsequencePhraseMatches.Add(values(cursor + i))
        Next
        cursor += userDefinedColumns

        ' Everything that remains is the 'x' hit data.
        _int32HitData = values.GetRange(cursor, values.Count - cursor)
    End Sub

    ''' <summary>Number of times the phrase appears in the column of the current row.</summary>
    Public Function hits_this_row(phrase As Integer, column As Integer) As Integer
        Return _int32HitData(3 * (column + phrase * userDefinedColumns) + HitsThisRowSlot)
    End Function

    ''' <summary>Number of times the phrase appears in the column over all rows.</summary>
    Public Function hits_all_rows(phrase As Integer, column As Integer) As Integer
        Return _int32HitData(3 * (column + phrase * userDefinedColumns) + HitsAllRowsSlot)
    End Function

    ''' <summary>Number of rows whose column contains the phrase at least once.</summary>
    Public Function docs_with_hits(phrase As Integer, column As Integer) As Integer
        Return _int32HitData(3 * (column + phrase * userDefinedColumns) + DocsWithHitsSlot)
    End Function

    ' Largest value (never below 0) found in the given slot of the 'x' data
    ' across all phrase/column pairs.
    Private Function MaxHitValue(slot As Integer) As Integer
        Dim best As Integer = 0
        For p = 0 To matchablePhrases - 1
            For c = 0 To userDefinedColumns - 1
                Dim current As Integer = _int32HitData(3 * (c + p * userDefinedColumns) + slot)
                If current > best Then
                    best = current
                End If
            Next
        Next
        Return best
    End Function
End Class
''' <summary>
''' Scalar SQL function "Rank": takes one argument, the blob returned by
''' matchinfo(), and returns the BM25 score computed by MatchInfo.
''' </summary>
<SQLiteFunction("Rank", 1, FunctionType.Scalar)>
Public Class Rank
    Inherits SQLiteFunction

    ''' <summary>Computes the score for one row.</summary>
    ''' <param name="args">Function arguments; args(0) is the matchinfo blob.</param>
    Public Overrides Function Invoke(args() As Object) As Object
        Dim blob As Byte() = CType(args(0), Byte())
        Dim info As New MatchInfo(blob)
        Return info.BM25Rank
    End Function
End Class

Resources