Skip to content

Commit 276be34

Browse files
committed
Changed the rules for NGC objects
1 parent e6a6d3c commit 276be34

File tree

2 files changed

+101
-19
lines changed

2 files changed

+101
-19
lines changed

contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,11 @@ public void setUp() throws Exception {
247247
assertU(adoc("id", "150", "bibcode", "xxxxxxxxxx150", "title", "nag5-abcd"));
248248
assertU(adoc("id", "151", "bibcode", "xxxxxxxxxx151", "title", "nag5abcd"));
249249
assertU(adoc("id", "152", "bibcode", "xxxxxxxxxx152", "title", "nag5 abcd"));
250+
assertU(adoc("id", "153", "bibcode", "xxxxxxxxxx153", "title", "NGC 1"));
251+
assertU(adoc("id", "154", "bibcode", "xxxxxxxxxx154", "title", "NGC-1"));
252+
assertU(adoc("id", "155", "bibcode", "xxxxxxxxxx155", "title", "N-1"));
253+
assertU(adoc("id", "156", "bibcode", "xxxxxxxxxx156", "title", "N 1"));
254+
assertU(adoc("id", "157", "bibcode", "xxxxxxxxxx157", "title", "NGC1"));
250255

251256
assertU(adoc("id", "318", "bibcode", "xxxxxxxxxx318", "title", "creation of a thesaurus", "pub", "creation of a thesaurus"));
252257
assertU(adoc("id", "382", "bibcode", "xxxxxxxxxx382", "title", "xhtml <tags> should be <SUB>fooxx</SUB> <xremoved>"));
@@ -970,7 +975,96 @@ public void testSynonyms() throws Exception {
970975
}
971976

972977
public void testOtherCases() throws Exception {
978+
979+
// change to NGC tokenizer in the schema; we want to index both
980+
// variants, but during search time only query for the concat version
981+
982+
assertQ(req("q", "title" + ":NGC"),
983+
"//*[@numFound='4']",
984+
"//doc/str[@name='id'][.='153']", //NGC 1
985+
"//doc/str[@name='id'][.='154']", //NGC-1
986+
"//doc/str[@name='id'][.='155']", //N-1
987+
"//doc/str[@name='id'][.='156']" //N 1
988+
//"//doc/str[@name='id'][.='157']" //NGC1
989+
);
990+
991+
assertQueryEquals(req("q", "title:\"NGC 1\"", "defType", "aqp"),
992+
"title:acr::ngc1",
993+
TermQuery.class);
994+
assertQ(req("q", "title" + ":NGC 1", "indent", "true"),
995+
"//*[@numFound='5']",
996+
"//doc/str[@name='id'][.='153']",
997+
"//doc/str[@name='id'][.='154']",
998+
"//doc/str[@name='id'][.='155']",
999+
"//doc/str[@name='id'][.='156']",
1000+
"//doc/str[@name='id'][.='157']"
1001+
);
1002+
1003+
1004+
assertQueryEquals(req("q", "title:\"NGC-1\"", "defType", "aqp"),
1005+
"title:acr::ngc1",
1006+
TermQuery.class);
1007+
assertQ(req("q", "title" + ":NGC-1"),
1008+
"//*[@numFound='5']",
1009+
"//doc/str[@name='id'][.='153']",
1010+
"//doc/str[@name='id'][.='154']",
1011+
"//doc/str[@name='id'][.='155']",
1012+
"//doc/str[@name='id'][.='156']",
1013+
"//doc/str[@name='id'][.='157']" //NGC1
1014+
);
1015+
1016+
assertQueryEquals(req("q", "title:\"N-1\"", "defType", "aqp"),
1017+
"title:n1",
1018+
TermQuery.class);
1019+
assertQ(req("q", "title" + ":N-1"),
1020+
"//*[@numFound='2']",
1021+
"//doc/str[@name='id'][.!='153']",
1022+
"//doc/str[@name='id'][.!='154']",
1023+
"//doc/str[@name='id'][.='155']",
1024+
"//doc/str[@name='id'][.='156']",
1025+
"//doc/str[@name='id'][.!='157']"
1026+
);
1027+
1028+
// this finds 2 (ids 155 'N-1' and 156 'N 1') because during indexing, we'd turn the two
1029+
// tokens into 'n1' - and this search is parsed into the same single term 'title:n1'
1030+
assertQueryEquals(req("q", "title:\"N 1\"", "defType", "aqp"),
1031+
"title:n1",
1032+
TermQuery.class);
1033+
assertQ(req("q", "title" + ":\"N 1\""),
1034+
"//*[@numFound='2']",
1035+
"//doc/str[@name='id'][.!='153']",
1036+
"//doc/str[@name='id'][.!='154']",
1037+
"//doc/str[@name='id'][.='155']",
1038+
"//doc/str[@name='id'][.='156']",
1039+
"//doc/str[@name='id'][.!='157']" //NGC1
1040+
);
1041+
1042+
assertQueryEquals(req("q", "title:\"NGC1\"", "defType", "aqp"),
1043+
"title:acr::ngc1",
1044+
TermQuery.class);
1045+
assertQ(req("q", "title" + ":NGC1"),
1046+
"//*[@numFound='5']",
1047+
"//doc/str[@name='id'][.='153']",
1048+
"//doc/str[@name='id'][.='154']",
1049+
"//doc/str[@name='id'][.='155']",
1050+
"//doc/str[@name='id'][.='156']",
1051+
"//doc/str[@name='id'][.='157']"
1052+
);
9731053

1054+
assertQueryEquals(req("q", "=title:\"NGC 1\"", "defType", "aqp"),
1055+
"title:\"acr::ngc 1\"",
1056+
PhraseQuery.class);
1057+
assertQ(req("q", "=title" + ":NGC 1"),
1058+
"//*[@numFound='4']",
1059+
"//doc/str[@name='id'][.='153']",
1060+
"//doc/str[@name='id'][.='154']",
1061+
"//doc/str[@name='id'][.='155']",
1062+
"//doc/str[@name='id'][.='156']",
1063+
"//doc/str[@name='id'][.!='157']"
1064+
);
1065+
1066+
1067+
9741068
// #147 - parsing of WDDF tokens
9751069
// analyzer operation. e.g. XXX-YYY => (XXX AND YYY) OR XXXYYY
9761070
assertQueryEquals(req("q", "NAG5-ABCD", "defType", "aqp"),

contrib/examples/adsabs/server/solr/collection1/conf/schema.xml

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@
415415
<charFilter class="solr.PatternReplaceCharFilterFactory"
416416
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1-$3 A$3" />
417417
<charFilter class="solr.PatternReplaceCharFilterFactory"
418-
pattern="\b(?i:(NGC)(-|\s+)([0-9]+[A-Z]*))\b" replacement="NGC$3" />
418+
pattern="\b(?i:(N)(-|\s+)([0-9]+[A-Z]*))\b" replacement="NGC-$3 N$3" />
419419
<charFilter class="solr.PatternReplaceCharFilterFactory"
420420
pattern="\b(?i:([34]CR?|ADS|H[DHR]|IC|[MW]|MKN|NGC|PKS|PSR[BJ]?|SAO|UGC|UT)(-|\s+)([0-9]+[A-Z]*))\b"
421421
replacement="$1-$3" />
@@ -490,18 +490,18 @@
490490
<filter class="solr.TrimFilterFactory" />
491491
<filter class="solr.LowerCaseFilterFactory" />
492492

493-
<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="indexer"/> -->
493+
<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="indexer"/> //-->
494494
</analyzer>
495495
<analyzer type="query">
496496
<charFilter class="solr.HTMLStripCharFilterFactory"/>
497497
<!-- AA: as above, but we only have one canonical replacement for the
498498
expression -->
499499
<charFilter class="solr.PatternReplaceCharFilterFactory"
500-
pattern="\b(?i:(MESSIER)(-|\s+)([0-9]+[A-Z]*))\b" replacement="M$3" />
500+
pattern="\b(?i:(MESSIER)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" />
501501
<charFilter class="solr.PatternReplaceCharFilterFactory"
502-
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="A$3" />
502+
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" />
503503
<charFilter class="solr.PatternReplaceCharFilterFactory"
504-
pattern="\b(?i:(NGC)(-|\s+)([0-9]+[A-Z]*))\b" replacement="NGC$3" />
504+
pattern="\b(?i:(NGC|N)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" />
505505
<charFilter class="solr.PatternReplaceCharFilterFactory"
506506
pattern="\b(?i:([34]CR?|ADS|H[DHR]|IC|[MW]|MKN|NGC|PKS|PSR[BJ]?|SAO|UGC|UT)(-|\s+)([0-9]+[A-Z]*))\b"
507507
replacement="$1$3" />
@@ -522,7 +522,7 @@
522522
splitOnNumerics="0" stemEnglishPossessive="1" preserveOriginal="0"
523523
/>
524524

525-
<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="split"/> -->
525+
<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="query:split"/> -->
526526

527527
<!-- lowercase words, but keep ACRONYMS case ie. MOND => MOND Mond =>
528528
mond Hubble Space Telescope => hubble space telescope -->
@@ -599,19 +599,7 @@
599599
<fieldType name="ads_text_nosyn" class="solr.TextField">
600600
<analyzer type="query">
601601
<charFilter class="solr.HTMLStripCharFilterFactory"/>
602-
<!-- AA: as above, but we only have one canonical replacement for the
603-
expression -->
604-
<charFilter class="solr.PatternReplaceCharFilterFactory"
605-
pattern="\b(?i:(MESSIER)(-|\s+)([0-9]+[A-Z]*))\b" replacement="M$3" />
606-
<charFilter class="solr.PatternReplaceCharFilterFactory"
607-
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="A$3" />
608-
<charFilter class="solr.PatternReplaceCharFilterFactory"
609-
pattern="\b(?i:(NGC)(-|\s+)([0-9]+[A-Z]*))\b" replacement="NGC$3" />
610-
<charFilter class="solr.PatternReplaceCharFilterFactory"
611-
pattern="\b(?i:([34]CR?|ADS|H[DHR]|IC|[MW]|MKN|NGC|PKS|PSR[BJ]?|SAO|UGC|UT)(-|\s+)([0-9]+[A-Z]*))\b"
612-
replacement="$1$3" />
613-
614-
602+
615603
<!-- tokenize on empty space (if it is not a hyphen connecting other
616604
words) -->
617605
<tokenizer class="solr.PatternTokenizerFactory" pattern="(?&lt;![-\s])\s+(?!-)"

0 commit comments

Comments
 (0)