libstemmer sphinx does not work

I have Sphinx installed on my Vagrant machine with CentOS 6, and I'm trying to install the Dutch libstemmer from Snowball.
The installation completed successfully, but the tests go wrong.
I have created 2 indexes with exactly the same data.
My indexes are:
index shop_products1 {
type = rt
dict = keywords
min_prefix_len = 3
rt_mem_limit = 2046M
path = /var/lib/sphinxsearch/data/shop_products2
morphology = libstemmer_nl, stem_en
html_strip = 1
html_index_attrs = img=alt,title; a=title;
preopen = 1
inplace_enable = 1
index_exact_words = 1
rt_field = name
rt_field = brand
rt_field = description
rt_field = specifications
rt_field = tags
rt_field = ourtags
rt_field = searchfield
rt_field = shop
rt_field = category
rt_field = color
rt_field = ourcolor
rt_field = gender
rt_field = material
rt_field = ean
rt_field = sku
rt_attr_string = ean
rt_attr_string = sku
rt_attr_float = price
rt_attr_float = discount
rt_attr_uint = shopid
rt_attr_uint = itemid
rt_attr_uint = deleted
rt_attr_uint = duplicate
rt_attr_uint = brandid
rt_attr_uint = duplicates
rt_attr_timestamp = updated_at
}
index shop_products2 {
type = rt
dict = keywords
min_prefix_len = 3
rt_mem_limit = 2046M
path = /var/lib/sphinxsearch/data/shop_products20
html_strip = 1
html_index_attrs = img=alt,title; a=title;
preopen = 1
inplace_enable = 1
index_exact_words = 1
rt_field = name
rt_field = brand
rt_field = description
rt_field = specifications
rt_field = tags
rt_field = ourtags
rt_field = searchfield
rt_field = shop
rt_field = category
rt_field = color
rt_field = ourcolor
rt_field = gender
rt_field = material
rt_field = ean
rt_field = sku
rt_attr_string = ean
rt_attr_string = sku
rt_attr_float = price
rt_attr_float = discount
rt_attr_uint = shopid
rt_attr_uint = itemid
rt_attr_uint = deleted
rt_attr_uint = duplicate
rt_attr_uint = brandid
rt_attr_uint = duplicates
rt_attr_timestamp = updated_at
}
searchd {
listen = 127.0.0.1:9306:mysql41
log = /var/log/sphinxsearch/searchd.log
workers = threads
binlog_path = /var/lib/sphinxsearch/rt-binlog
read_timeout = 5
client_timeout = 200
max_children = 0
# 2 hours
rt_flush_period = 7200
pid_file = /var/run/searchd.pid
}
When I search, for example, for the Dutch word "afzuigkappen", it should give exactly the same results as "afzuigkap".
Can someone give me some information about how to get this working, please?
PS: sorry for my bad English.

The Dutch stemmer in Snowball stems afzuigkappen and afzuigkap differently:
afzuigkappen -> afzuigkapp
afzuigkap -> afzuigkap
So you would have to update the stemmer algorithm to achieve your objective; documentation about the algorithm is here.
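You can verify this from SphinxQL with CALL KEYWORDS, which runs words through the index's own morphology settings (assuming searchd is running and shop_products1 is loaded):
CALL KEYWORDS('afzuigkappen afzuigkap', 'shop_products1');
# the normalized column should show afzuigkapp and afzuigkap, i.e. two different index keywords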

Alright, I have created some specific tests.
The indexes I've created:
index test1 {
type = rt
dict = keywords
min_prefix_len = 3
rt_mem_limit = 2046M
morphology = libstemmer_nl, stem_en
path = /var/lib/sphinxsearch/data/test1
preopen = 1
inplace_enable = 1
index_exact_words = 1
rt_field = name
rt_attr_uint = shopid
rt_attr_uint = itemid
}
index test2 {
type = rt
dict = keywords
min_prefix_len = 3
rt_mem_limit = 2046M
path = /var/lib/sphinxsearch/data/test2
preopen = 1
inplace_enable = 1
index_exact_words = 1
rt_field = name
rt_attr_uint = shopid
rt_attr_uint = itemid
}
I indexed a smaller database with football products and searched with Sphinx; the results: http://imgur.com/n95Ue8v
As you can see, both give the same output with 53 records. If I search just in MySQL with select * from tests1 WHERE name LIKE '%keeper%', I get 360 results.
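Note that LIKE '%keeper%' in MySQL is a substring match, while the test indexes above only set min_prefix_len, so Sphinx matches whole keywords or, with a wildcard, prefixes. A rough comparison against the test1 index (a sketch):
select * from test1 where match('keeper');   # whole-keyword matches only
select * from test1 where match('keeper*');  # prefix matches (min_prefix_len = 3)
# matching inside words, like LIKE '%keeper%' does, would additionally need min_infix_len and a '*keeper*' query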

Related

how to set realtime index attribute type

I have an index where I would like to insert, update and delete rows using a realtime index, but the RT index doesn't work correctly.
This is the plain index config:
index jobResumeIndex
{
source = jobResumeSource
path = {{path_to_data}}/{{data_file_name}}/jobResumeIndex
morphology = stem_enru
charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F, U+401->U+0435, U+451->U+0435
min_prefix_len = 3
#min_infix_len = 3
index_exact_words = 1
expand_keywords = 1
}
and this is the real-time config:
index jobResumeRT
{
type = rt
source = jobResumeSource
path = {{path_to_data}}/{{data_file_name}}/jobResume
# list of fields for insertion
rt_field = post
rt_field = wage
rt_field = currency_id
rt_field = tariff_rate_id
rt_field = business_trip_id
rt_field = work_experience_id
rt_field = citizenship_id
rt_field = geo_place_id
rt_field = age
rt_field = gender
rt_field = only_with_avatar
rt_field = only_with_portfolio
rt_field = only_with_wage
rt_field = visible
rt_field = status
rt_field = updated_at
rt_field = about_me
rt_field = fio
rt_field = work_permit_ids
rt_field = prof_area_ids
rt_field = driver_license_ids
rt_field = skills
rt_field = employment_ids
rt_field = schedule_ids
rt_field = education_ids
rt_field = contacts
rt_field = language_codes
rt_field = experience_text
rt_field = portfolio_text
rt_field = course_text
rt_attr_string = post
rt_attr_uint = wage
rt_attr_uint = currency_id
rt_attr_uint = tariff_rate_id
rt_attr_uint = business_trip_id
rt_attr_uint = work_experience_id
rt_attr_uint = citizenship_id
rt_attr_uint = geo_place_id
rt_attr_uint = age
rt_attr_uint = gender
rt_attr_uint = only_with_avatar
rt_attr_uint = only_with_portfolio
rt_attr_uint = only_with_wage
rt_attr_uint = visible
rt_attr_uint = status
rt_attr_uint = updated_at
rt_attr_string = about_me
rt_attr_string = fio
rt_attr_string = work_permit_ids
rt_attr_string = prof_area_ids
rt_attr_string = driver_license_ids
rt_attr_string = skills
rt_attr_string = employment_ids
rt_attr_string = schedule_ids
rt_attr_string = education_ids
rt_attr_string = contacts
rt_attr_string = language_codes
rt_attr_string = experience_text
rt_attr_string = portfolio_text
rt_attr_string = course_text
rt_mem_limit = 512M
morphology = stem_enru
charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F, U+401->U+0435, U+451->U+0435
min_prefix_len = 3
index_exact_words = 1
expand_keywords = 1
}
and here both indexes are combined into one:
index jobResume
{
type = distributed
local = jobResumeIndex
local = jobResumeRT
}
then when I try to insert a new record:
mysql> insert into jobResumeRT values(119,'Инженер - инспектор по безопасности полетов',0,190,193,7,15,0,538560,'14',1,1,0,0,1,2,'1575616137','','Иван Иванов ','','12,211','','{1057369: PHP (PHP4, PHP5, PHP5.5, HPHP)}','1','10','1234567898','','','','','');
ERROR 1064 (42000): row 1, column 3: string expected
The third field, currency_id, must be an integer type, so why is a string expected? I don't understand.
Firstly, RT indexes DON'T have a 'source'. They contain the data directly, rather than loading it from a remote source. It might not be an error to specify source=, but it will be ignored.
RT indexes create their schema from the rt_field and rt_attr_* directives.
As such, the schema won't be the same as the local/disk index. It will usually be in the same order as defined in the index definition, but it can vary (if the index has undergone edits).
... best to run DESCRIBE jobResumeRT to find the actual order of all the columns in the index. Then, when doing an INSERT etc. without naming the columns, insert the columns in the same order as returned by DESCRIBE.
Or do an INSERT naming the columns in the command, in the same order. This may actually be better, as you can then insert into a string attribute and a field at once.
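For example, something along these lines (a sketch; the real column order is whatever DESCRIBE reports, and the values are placeholders taken from the insert above):
DESCRIBE jobResumeRT;
# insert by naming the columns; post is declared as both rt_field and rt_attr_string,
# so the one named value fills the full-text field and the string attribute at once
INSERT INTO jobResumeRT (id, post, wage, currency_id, tariff_rate_id)
VALUES (119, 'Инженер - инспектор по безопасности полетов', 0, 190, 193);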

How to create a select clause with AND query that matches tx_news items with two or more associated categories

I am trying to create a query to get all news items that are flagged with at least two different categories, and they have to match the AND condition.
I need this query to decide whether the following code should be rendered or not. E.g. if there is no news item with category A and category B, do nothing; else show the tx_news LIST view.
lib.field_dmnewsplugin.5 = CONTENT
lib.field_dmnewsplugin.5 {
table = tx_news_domain_model_news
select {
pidInList = 124
max = 9
orderBy = uid DESC
leftjoin = sys_category_record_mm ON (sys_category_record_mm.uid_foreign = tx_news_domain_model_news.uid)
#andWhere = sys_category_record_mm.uid_local IN (14,16)
#where = sys_category_record_mm.uid_local = 14
andWhere = sys_category_record_mm.uid_local = 14 AND sys_category_record_mm.uid_local = 16
}
renderObj = COA
renderObj {
1 = TEXT
1.value = Aktuelles
1.wrap = <h2>|</h2>
2 = TEXT
2.field = title
2.crop = 50|...|1
2.wrap = <h3>|</h3>
3 = TEXT
3.field = teaser
3.crop = 500|...|1
3.wrap = <p>|</p>
}
}
My code is the result of some testing. With the "andWhere" clause, the result is empty. Without any where clause, I get duplicate entries for all news items, because all of them have at least two different categories.
My goal is to get unique results for each news item that is flagged with category A and category B (and maybe, as a universal solution, additional categories).
What do I have to do?
Thank you in advance,
Ralf
Try to put the WHERE clause into the ON part of the JOIN and use a groupBy to get a counter.
select {
selectFields = count(*) AS counter
leftjoin = sys_category_record_mm ON (sys_category_record_mm.uid_foreign = tx_news_domain_model_news.uid) AND sys_category_record_mm.uid_local IN (14,16)
pidInList = 124
max = 9
groupBy = uid
orderBy = uid DESC
where = counter > 1
}
After I had to realize that Jo's solution does not work for me, I had another idea:
lib.field_dmnewsplugin = COA
lib.field_dmnewsplugin {
10 = CONTENT
10 {
table = tx_news_domain_model_news
select {
selectFields = title, teaser, count(uid) AS counter
leftjoin = sys_category_record_mm ON (sys_category_record_mm.uid_foreign = tx_news_domain_model_news.uid) AND sys_category_record_mm.uid_local IN (14,###maincat###)
pidInList = 124
max = 1
groupBy = uid
orderBy = counter DESC, crdate DESC
#where = counter > 1
markers {
maincat.value = 16
}
}
renderObj = COA
renderObj {
10 = COA
10 {
stdWrap {
if {
value = 1
isGreaterThan.data = field:counter
#equals.data = field:counter
}
required = 1
wrap = <h2>Some Headline</h2>
}
10 = USER
10 {
userFunc = TYPO3\CMS\Extbase\Core\Bootstrap->run
extensionName = News
pluginName = Pi1
vendorName = GeorgRinger
switchableControllerActions {
News {
1 = list
}
}
settings < plugin.tx_news.settings
settings {
cropMaxCharacters = 164 | [...] | 1
categoryConjunction = and
categories = 14,16
excludeAlreadyDisplayedNews = 1
archiveRestriction = active
[...]
The problem is that we cannot use the alias "counter" in the where clause, and I have no idea how to solve that with TypoScript. With native SQL there might be a better way.
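For instance, in plain MySQL the counter could go into a HAVING clause instead of WHERE; roughly like this (a sketch outside TypoScript, using the table and category uids from above):
SELECT n.uid, n.title, n.teaser, COUNT(mm.uid_local) AS counter
FROM tx_news_domain_model_news n
LEFT JOIN sys_category_record_mm mm
  ON mm.uid_foreign = n.uid AND mm.uid_local IN (14, 16)
WHERE n.pid = 124
GROUP BY n.uid
HAVING counter > 1
ORDER BY counter DESC;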
But I'm able to get the value of "counter" to create an "if" rule. Additionally, I can sort the query by "counter". So, if the query returns at least one hit with "counter" greater than 1, I can decide to render a COA object like a news list view with a headline.
I'm satisfied with this solution. But maybe somebody has a special trick for me?
Thanks for your help,
Ralf

sphinxsearch does not return me content and title fields

I am using Sphinx search with a documents table. I want to fetch all the fields, but it doesn't return all of them. Please check my config file and let me know what to change. I am using the test index.
Table: documents
Fields: id, group_id, group_id2, date_added, content, title
mysql> select * from test1 where match ('my document');
+------+----------+------------+
| id | group_id | date_added |
+------+----------+------------+
| 1 | 1 | 1461672351 |
| 2 | 1 | 1461672351 |
+------+----------+------------+
2 rows in set (0.00 sec)
mysql>
source src1
{
type = mysql
sql_host = localhost
sql_user = root
sql_pass = india#123
sql_db = test
sql_port = 3306
sql_query = \
SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content \
FROM documents
sql_attr_uint = group_id
sql_attr_timestamp = date_added
sql_ranged_throttle = 0
sql_query_info = SELECT * FROM documents WHERE id=$id
}
source src1throttled : src1
{
sql_ranged_throttle = 100
}
index test1
{
source = src1
path = /var/lib/sphinxsearch/data/test1
docinfo = extern
dict = keywords
mlock = 0
morphology = none
min_word_len = 1
html_strip = 0
}
index test1stemmed : test1
{
path = /var/lib/sphinxsearch/data/test1stemmed
morphology = stem_en
}
index dist1
{
type = distributed
local = test1
local = test1stemmed
agent = localhost:9313:remote1
agent = localhost:9314:remote2,remote3
agent_connect_timeout = 1000
agent_query_timeout = 3000
}
index rt
{
type = rt
path = /var/lib/sphinxsearch/data/rt
rt_field = title
rt_field = content
rt_attr_uint = gid
}
indexer
{
mem_limit = 128M
}
searchd
{
listen = 9312
listen = 9306:mysql41
log = /var/log/sphinxsearch/searchd.log
query_log = /var/log/sphinxsearch/query.log
read_timeout = 5
client_timeout = 300
max_children = 30
persistent_connections_limit = 30
pid_file = /var/run/sphinxsearch/searchd.pid
seamless_rotate = 1
preopen_indexes = 1
unlink_old = 1
mva_updates_pool = 1M
max_packet_size = 8M
max_filters = 256
max_filter_values = 4096
max_batch_queries = 32
workers = threads # for RT to work
}
common
{
}
No. Sphinx does not 'store' fields, so they can't be returned.
Sphinx 'indexes' the fields, but in its internal inverted-index format.
... to get data back out of Sphinx, use attributes. In your example index, you've made group_id and date_added attributes via the sql_attr_* directives.
sql_field_string could be useful, as it makes a column BOTH a field AND a string attribute.
I play a lot with Sphinx search and I got the same result (no title, content).
If you add these rows to the config file, the SELECT * FROM test1 query will return the title and content fields, too:
sql_field_string = title
sql_field_string = content
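After adding these two directives you need to rebuild the indexes (e.g. indexer --all --rotate) so that the new string attributes exist; then the query from the question should return them as well:
select * from test1 where match ('my document');
# now also returns title and content (as string attributes), alongside id, group_id and date_added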

skipping non-plain index rt (sphinx 2.1.6)

Here is the question. Sphinx, version 2.1.6. I am using an RT (real-time) index, but when indexing, this message is displayed in the console:
using config file 'sphinx.conf'...
skipping non-plain index 'rt'...
But when I connect to Sphinx and run the query mysql> desc rt, it displays:
+------------+--------+
| Field | Type |
+------------+--------+
| id | bigint |
| id | field |
| first_name | field |
| last_name | field |
+------------+--------+
Is this default data? It does not match my query. How do I work with the rt index?
sphinx.conf:
source database
{
type = mysql
sql_host = 127.0.0.1
sql_user = test
sql_pass = test
sql_db = community
sql_port = 3306
mysql_connect_flags = 32 # enable compression
sql_query_pre = SET NAMES utf8
sql_query_pre = SET SESSION query_cache_type=OFF
}
source rt : database
{
sql_query_range = SELECT MIN(id),MAX(id) FROM mbt_accounts
sql_query = SELECT id AS 'accountId', first_name AS 'fname', last_name AS 'lname' FROM mbt_accounts WHERE id >= 0 AND id<= 1000
sql_range_step = 1000
sql_ranged_throttle = 1000 # milliseconds
}
index rt
{
source = rt
type = rt
path = /etc/sphinxsearch/rtindex
rt_mem_limit = 700M
rt_field = accountId
rt_field = fname
rt_field = lname
rt_attr_string = fname
rt_attr_string = lname
charset_type = utf-8
charset_table = 0..9, A..Z->a..z, _, -, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F, U+401->U+451, U+451
}
searchd
{
listen = localhost:9312 # port for API
listen = localhost:9306:mysql41 #port for a SphinxQL
log = /var/log/sphinxsearch/searchd.log
binlog_path = /var/log/sphinxsearch/
query_log = /var/log/sphinxsearch/query.log
query_log_format = sphinxql
pid_file = /var/run/sphinxsearch/searchd.pid
workers = threads
max_matches = 1000
read_timeout = 5
client_timeout = 300
max_children = 30
max_packet_size = 8M
binlog_flush = 2
binlog_max_log_size = 90M
thread_stack = 8M
expansion_limit = 500
rt_flush_period = 1800
collation_server = utf8_general_ci
compat_sphinxql_magics = 0
prefork_rotation_throttle = 100
}
Thanks.
indexer only works with indexes that have a 'source', i.e. plain disk indexes. That is, indexer does the stuff in the source to get the data to create the index.
RT (Real Time) indexes work very differently. indexer is not involved with RT indexes at all. They are handled entirely by searchd.
To add data to an RT index, you need to run SphinxQL commands (INSERT, UPDATE etc.) that actually add the data to the index.
(DESCRIBE works because searchd knows the 'structure' of the index (you told it via the rt_field directives etc.), even if no data has ever been inserted.)
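For example, with the rt index defined above, a minimal round trip could look like this (hypothetical values; the column names come from the rt_field / rt_attr_string directives):
INSERT INTO rt (id, accountId, fname, lname) VALUES (1, '100', 'John', 'Smith');
# fname and lname are both fields and string attributes, so one named value fills both
SELECT * FROM rt WHERE MATCH('John');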
Ah, I think you are asking why the structure is different. That's probably because the index was created before you modified sphinx.conf. If you change the definition of an RT index, you need to 'destroy' the index to allow it to be recreated.
The simplest way is to shut down searchd, delete the index files, delete the binlog (it is no longer relevant) and then restart searchd.
searchd --stopwait
rm /etc/sphinxsearch/rtindex*
rm /var/log/sphinxsearch/binlog* # (the binlog_path from the config above)
searchd # (starts searchd again)

SphinxQL infix search

I am trying to use a Sphinx RT index for my website. Everything works except infix searching.
index rt
{
type = rt
path = /var/lib/sphinxsearch/data/rt.sph
rt_field = name
rt_field = address
rt_field = keyword
rt_attr_uint = type
min_word_len = 1
min_infix_len = 3
enable_star = 0
html_strip = 0
inplace_enable = 0
charset_type = utf-8
}
I inserted some values to my rt index.
Example:
Insert into rt(id,name,address,keyword,type) values(100,'JohnRambo','Newyork','assassin',1);
Now, when I search for 'JohnRambo', it returns the correct result. But when I search for 'John' or 'Rambo', I get an empty result set.
mysql> select * from rt where match('John');
Empty set (0.00 sec)
Is there anything that I am missing? Any help is much appreciated!
Infix/prefix searching requires dict = keywords on an RT index.
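As a sketch of the change: add dict = keywords to the rt index above and recreate the index (an RT index's structure is only picked up when its files are created), then wildcard queries should match inside words; depending on the version you may also need enable_star = 1, or expand_keywords = 1 to expand plain keywords automatically:
select * from rt where match('John*');    # prefix wildcard, should now match JohnRambo
select * from rt where match('*Rambo*');  # infix wildcard (min_infix_len = 3 is already set)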