Skip to content
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation
This project
Loading...
Sign in
石磊
/
cihai
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit e741f194
authored
Dec 06, 2017
by
石头
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge remote-tracking branch 'origin/master'
2 parents
83564d06
ceef850a
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
149 additions
and
36 deletions
serverside/cihai-core/src/main/java/com/dookay/cihai/core/aip/AipUtilBean.java
serverside/cihai-core/src/test/java/com/dookay/cihai/core/BaiduDemo.java
serverside/cihai-core/src/test/java/com/dookay/cihai/core/BaiduTest.java
serverside/cihai-core/src/test/resources/application.properties
serverside/cihai-core/src/main/java/com/dookay/cihai/core/aip/AipUtilBean.java
View file @
e741f19
...
...
@@ -25,14 +25,18 @@ import com.dookay.coral.common.core.utils.lang.CollectionUtils;
import
com.dookay.coral.common.core.utils.lang.StringUtils
;
import
lombok.extern.slf4j.Slf4j
;
import
org.apache.commons.collections4.ListUtils
;
import
org.apache.commons.collections4.set.ListOrderedSet
;
import
org.apache.commons.lang3.tuple.ImmutablePair
;
import
org.apache.commons.lang3.tuple.Pair
;
import
org.json.JSONArray
;
import
org.json.JSONException
;
import
org.json.JSONObject
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.data.redis.core.StringRedisTemplate
;
import
org.springframework.stereotype.Component
;
import
java.util.*
;
import
java.util.concurrent.ConcurrentHashMap
;
import
java.util.concurrent.TimeUnit
;
import
java.util.stream.Collectors
;
/**
...
...
@@ -45,13 +49,23 @@ import java.util.stream.Collectors;
@Slf4j
public
final
class
AipUtilBean
{
private
static
final
ConcurrentHashMap
<
String
,
Double
>
SCORE_MAP
=
new
ConcurrentHashMap
<>();
private
static
final
String
SCORE_KEY_PREFIX
=
"WORD_SCORE:"
;
private
static
final
double
CRITICAL_VALUE
=
0.4
D
;
/**
* 内部错误
*/
private
static
final
String
INTERNAL_ERROR
=
"282000"
;
private
final
AipNlp
aipNlp
;
private
final
StringRedisTemplate
template
;
@Autowired
private
AipNlp
aipNlp
;
/**
 * Creates the bean from the two collaborators it keeps for later use.
 *
 * @param aipNlp   NLP client stored in the {@code aipNlp} field
 * @param template Redis string template stored in the {@code template} field
 */
public AipUtilBean(AipNlp aipNlp, StringRedisTemplate template) {
    this.template = template;
    this.aipNlp = aipNlp;
}
/**
* 抽取查询关键词
...
...
@@ -123,6 +137,7 @@ public final class AipUtilBean {
/**
* 抽取关键词
*
* @param keyword
* @param document 文档
* @param size 关键词个数
* @return
...
...
@@ -130,28 +145,38 @@ public final class AipUtilBean {
* @author houkun
* @date 2017/12/6
*/
public
List
<
String
>
extractKeyWords
(
String
document
,
int
size
)
throws
JSONException
{
public
List
<
String
>
extractKeyWords
(
String
keyword
,
String
document
,
int
size
)
throws
JSONException
{
List
<
LexerItem
>
lexerItems
=
getLexerItems
(
document
);
Map
<
String
,
Set
<
LexerItem
>>
itemMap
=
lexerItems
.
s
tream
()
List
<
String
>
keywords
=
lexerItems
.
parallelS
tream
()
.
filter
(
l
->
LexerPosConst
.
inThis
(
l
.
getPos
())
||
LexerNeConst
.
inThis
(
l
.
getNe
()))
.
collect
(
Collectors
.
groupingBy
(
LexerItem:
:
getItem
,
Collectors
.
mapping
(
l
->
l
,
Collectors
.
toSet
())));
ListOrderedSet
<
String
>
words
=
new
ListOrderedSet
<>();
// 规则1 优先专有名词
for
(
Map
.
Entry
<
String
,
Set
<
LexerItem
>>
entry
:
itemMap
.
entrySet
())
{
Set
<
LexerItem
>
value
=
entry
.
getValue
();
boolean
isNe
=
value
.
stream
().
anyMatch
(
l
->
LexerNeConst
.
inThis
(
l
.
getNe
()));
if
(
isNe
)
{
words
.
add
(
entry
.
getKey
());
}
}
// 规则2 其他名词按照出现次数
itemMap
.
entrySet
().
stream
().
sorted
(
Comparator
.
comparing
(
e
->
e
.
getValue
().
size
())).
forEach
(
e
->
{
words
.
add
(
e
.
getKey
());
});
return
words
.
stream
().
limit
(
size
).
collect
(
Collectors
.
toList
());
.
map
(
LexerItem:
:
getItem
)
.
distinct
()
.
filter
(
w
->
doSimnet
(
w
,
keyword
)
>
CRITICAL_VALUE
)
.
sorted
(
Comparator
.
comparingDouble
(
word
->
Double
.
parseDouble
(
template
.
opsForValue
().
get
(
getScoreKey
((
String
)
word
,
keyword
)))).
reversed
())
.
limit
(
size
)
.
collect
(
Collectors
.
toList
());
return
keywords
;
}
/**
 * Computes the relatedness score between a keyword and every candidate word.
 *
 * <p>Each pair is scored through {@code doSimEmbedding(keyword, word)}; the calls are issued
 * from a parallel stream so the scores are gathered in a thread-safe map.
 *
 * @param keyword the reference keyword; used as the left element of every result key
 * @param words   candidate words; each becomes the right element of one result key
 * @return a mutable map from {@code (keyword, word)} to the score of that pair
 * @author houkun
 * @date 2017/12/6
 */
public Map<Pair<String, String>, Double> calcKeywordsRelated(String keyword, List<String> words) {
    // forEach on a parallel stream runs the lambda on several threads at once; a plain HashMap
    // can lose entries or corrupt its table under concurrent put(), so accumulate concurrently.
    Map<Pair<String, String>, Double> scores = new ConcurrentHashMap<>(words.size());
    words.parallelStream()
            .forEach(word -> scores.put(new ImmutablePair<>(keyword, word), doSimEmbedding(keyword, word)));
    // Hand back a HashMap, as before, so callers keep the same map semantics.
    return new HashMap<>(scores);
}
...
...
@@ -172,7 +197,6 @@ public final class AipUtilBean {
log
.
debug
(
s
);
JSONObject
res
=
doLexer
(
s
);
JSONArray
items
=
res
.
getJSONArray
(
"items"
);
// log.debug(items.toString(2));
lexerItems
=
ListUtils
.
union
(
lexerItems
,
JSON
.
parseArray
(
items
.
toString
(),
LexerItem
.
class
));
}
return
lexerItems
;
...
...
@@ -226,6 +250,81 @@ public final class AipUtilBean {
return
res
;
}
/**
 * Computes how related two words are, using the word-embedding similarity call.
 *
 * <p>The score is looked up in Redis first (key built by {@code getScoreKey(s1, s2)}); on a
 * miss the remote service is queried and the result is cached for one hour. When the service
 * answers with a non-internal error the score is taken from {@code doSimnet} instead.
 *
 * @param s1 first word
 * @param s2 second word
 * @return the score, or {@code 0} when the response cannot be parsed or the service keeps
 *         answering with the internal-error code
 * @author houkun
 * @date 2017/12/6
 */
private double doSimEmbedding(String s1, String s2) {
    String key = getScoreKey(s1, s2);
    String cached = template.opsForValue().get(key);
    if (StringUtils.isNotEmpty(cached)) {
        return Double.parseDouble(cached);
    }

    // Internal errors are retried a bounded number of times; the previous unbounded
    // self-recursion could end in a StackOverflowError while the service was failing.
    final int maxAttempts = 3;
    try {
        double score = 0;
        boolean resolved = false;
        for (int attempt = 1; attempt <= maxAttempts && !resolved; attempt++) {
            JSONObject res = aipNlp.wordSimEmbedding(s1, s2);
            if (!res.has("error_code")) {
                score = res.getDouble("score");
                resolved = true;
            } else if (!INTERNAL_ERROR.equals(String.valueOf(res.get("error_code")))) {
                // String.valueOf(get(..)) accepts both numeric and string codes, whereas
                // getString(..) may reject a numeric value.
                log.warn(res.toString());
                score = doSimnet(s1, s2);
                resolved = true;
            }
        }
        if (!resolved) {
            // Do not cache a transient failure.
            log.warn("wordSimEmbedding kept returning internal error after {} attempts: {} / {}",
                    maxAttempts, s1, s2);
            return 0;
        }
        template.opsForValue().set(key, String.valueOf(score), 1, TimeUnit.HOURS);
        return score;
    } catch (JSONException e) {
        log.warn("Unparseable wordSimEmbedding response for {} / {}", s1, s2, e);
        return 0;
    }
}
/**
 * Computes how related two words are, using the older {@code simnet} call.
 *
 * <p>An internal-error response is retried a bounded number of times; any other error, or an
 * internal error that persists, is reported as a {@code ServiceException} carrying the code.
 *
 * @param s1 first text
 * @param s2 second text
 * @return the score from the response, or {@code 0} when the response cannot be parsed
 * @author houkun
 * @date 2017/12/6
 */
private double doSimnet(String s1, String s2) {
    // Bounded retry instead of unbounded self-recursion, which could overflow the stack
    // while the service kept answering with the internal-error code.
    final int maxAttempts = 3;
    for (int attempt = 1; ; attempt++) {
        JSONObject res = aipNlp.simnet(s1, s2, new HashMap<>(0));
        try {
            if (!res.has("error")) {
                return res.getDouble("score");
            }
            // NOTE(review): doSimEmbedding checks "error_code" while this method checks
            // "error" — confirm which key the simnet response actually uses.
            // String.valueOf(get(..)) accepts both numeric and string codes.
            String errorCode = String.valueOf(res.get("error"));
            if (!INTERNAL_ERROR.equals(errorCode) || attempt >= maxAttempts) {
                throw new ServiceException(errorCode);
            }
        } catch (JSONException e) {
            // Best effort, as before: an unparseable response counts as "no relation".
            return 0;
        }
    }
}
/**
 * Builds the cache key for the score of a word pair: the score prefix, the first word,
 * a colon, then the second word. The key depends on argument order.
 *
 * @param s1 first word
 * @param s2 second word
 * @return the cache key for {@code (s1, s2)}
 */
private String getScoreKey(String s1, String s2) {
    StringBuilder key = new StringBuilder(SCORE_KEY_PREFIX);
    key.append(s1);
    key.append(":");
    key.append(s2);
    return key.toString();
}
/**
* 分割文档
...
...
serverside/cihai-core/src/test/java/com/dookay/cihai/core/BaiduDemo.java
View file @
e741f19
...
...
@@ -48,14 +48,14 @@ public class BaiduDemo {
// System.out.println(res.toString(2));
//
// // 句法
HashMap
<
String
,
Object
>
option
=
new
HashMap
<>();
option
.
put
(
"mode"
,
1
);
JSONObject
des
=
client
.
depParser
(
text
,
option
);
System
.
out
.
println
(
des
.
toString
(
2
));
JSONObject
des1
=
client
.
depParser
(
text1
,
option
);
System
.
out
.
print
(
des1
.
toString
(
2
));
JSONObject
des2
=
client
.
depParser
(
text2
,
option
);
System
.
out
.
print
(
des2
.
toString
(
2
));
//
HashMap<String, Object> option = new HashMap<>();
//
option.put("mode", 1);
//
JSONObject des = client.depParser(text, option);
//
System.out.println(des.toString(2));
//
JSONObject des1 = client.depParser(text1, option);
//
System.out.print(des1.toString(2));
//
JSONObject des2 = client.depParser(text2, option);
//
System.out.print(des2.toString(2));
//
// // 相似
...
...
@@ -64,9 +64,9 @@ public class BaiduDemo {
// JSONObject response = client.simnet("十九大","中国共产党第十九次全国代表大会", option1);
// System.out.println(response.toString(2));
//
// 词向量
// JSONObject vect = client.wordEmbedding("习近平
");
//
System.out.println(vect.toString(2));
// 词向量
JSONObject
vect
=
client
.
wordEmbedding
(
"十九大
"
);
System
.
out
.
println
(
vect
.
toString
(
2
));
//
// // 相似
...
...
serverside/cihai-core/src/test/java/com/dookay/cihai/core/BaiduTest.java
View file @
e741f19
...
...
@@ -16,6 +16,7 @@ package com.dookay.cihai.core;
import
com.alibaba.fastjson.JSON
;
import
com.dookay.cihai.core.aip.AipUtilBean
;
import
org.apache.commons.lang3.tuple.Pair
;
import
org.junit.Assert
;
import
org.junit.Test
;
import
org.springframework.beans.factory.annotation.Autowired
;
...
...
@@ -70,9 +71,16 @@ public class BaiduTest extends CihaiCoreApplicationTests {
File
file
=
resource
.
getFile
();
FileReader
reader
=
new
FileReader
(
file
);
String
text
=
FileCopyUtils
.
copyToString
(
reader
);
List
<
String
>
list
=
aipUtilBean
.
extractKeyWords
(
text
,
10
);
List
<
String
>
list
=
aipUtilBean
.
extractKeyWords
(
"中国共产党第十九次全国代表大会"
,
text
,
15
);
System
.
out
.
println
(
JSON
.
toJSONString
(
list
));
Map
<
String
,
Long
>
map
=
aipUtilBean
.
extractNounWordsWithCount
(
text
);
System
.
out
.
print
(
map
.
toString
());
Map
<
Pair
<
String
,
String
>,
Double
>
map
=
aipUtilBean
.
calcKeywordsRelated
(
"中国共产党第十九次全国代表大会"
,
list
);
for
(
String
s
:
list
)
{
Map
<
Pair
<
String
,
String
>,
Double
>
map1
=
aipUtilBean
.
calcKeywordsRelated
(
s
,
list
);
System
.
out
.
print
(
map1
);
}
System
.
out
.
print
(
map
);
// Map<String, Long> map = aipUtilBean.extractNounWordsWithCount(text);
// System.out.print(map.toString());
}
}
serverside/cihai-core/src/test/resources/application.properties
View file @
e741f19
...
...
@@ -15,7 +15,13 @@ spring.datasource.druid.filter.config.enabled=true
mapper.mappers
=
com.dookay.coral.common.core.persistence.Mapper
mybatis.mapper-
locations
=
classpath*:mapper/*.xml
spring.redis.host
=
192.168.2.27
spring.redis.password
=
100001
aip.app-
id
=
10486245
aip.api-
key
=
ws8qdxT51xm2qbWufxzRedI3
aip.secret-
key
=
8b6g9ZyR69dFl6aqYdIOGa4IbOGgkdjh
logging.level.com.dookay.cihai.core
=
debug
Write
Preview
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment