Commit e741f194 by 石头

Merge remote-tracking branch 'origin/master'

2 parents 83564d06 ceef850a
......@@ -25,14 +25,18 @@ import com.dookay.coral.common.core.utils.lang.CollectionUtils;
import com.dookay.coral.common.core.utils.lang.StringUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.ListUtils;
import org.apache.commons.collections4.set.ListOrderedSet;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Component;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/**
......@@ -45,13 +49,23 @@ import java.util.stream.Collectors;
@Slf4j
public final class AipUtilBean {
private static final ConcurrentHashMap<String, Double> SCORE_MAP = new ConcurrentHashMap<>();
private static final String SCORE_KEY_PREFIX = "WORD_SCORE:";
private static final double CRITICAL_VALUE = 0.4D;
/**
* 内部错误
*/
private static final String INTERNAL_ERROR = "282000";
private final AipNlp aipNlp;
private final StringRedisTemplate template;
@Autowired
private AipNlp aipNlp;
public AipUtilBean(AipNlp aipNlp, StringRedisTemplate template) {
this.aipNlp = aipNlp;
this.template = template;
}
/**
* 抽取查询关键词
......@@ -123,6 +137,7 @@ public final class AipUtilBean {
/**
* 抽取关键词
*
* @param keyword
* @param document 文档
* @param size 关键词个数
* @return
......@@ -130,28 +145,38 @@ public final class AipUtilBean {
* @author houkun
* @date 2017/12/6
*/
public List<String> extractKeyWords(String document, int size) throws JSONException {
public List<String> extractKeyWords(String keyword, String document, int size) throws JSONException {
List<LexerItem> lexerItems = getLexerItems(document);
Map<String, Set<LexerItem>> itemMap = lexerItems.stream()
List<String> keywords = lexerItems.parallelStream()
.filter(l -> LexerPosConst.inThis(l.getPos())
|| LexerNeConst.inThis(l.getNe()))
.collect(Collectors.groupingBy(
LexerItem::getItem,
Collectors.mapping(l -> l, Collectors.toSet())));
ListOrderedSet<String> words = new ListOrderedSet<>();
// 规则1 优先专有名词
for (Map.Entry<String, Set<LexerItem>> entry : itemMap.entrySet()) {
Set<LexerItem> value = entry.getValue();
boolean isNe = value.stream().anyMatch(l -> LexerNeConst.inThis(l.getNe()));
if (isNe) {
words.add(entry.getKey());
}
}
// 规则2 其他名词按照出现次数
itemMap.entrySet().stream().sorted(Comparator.comparing(e -> e.getValue().size())).forEach(e -> {
words.add(e.getKey());
});
return words.stream().limit(size).collect(Collectors.toList());
.map(LexerItem::getItem)
.distinct()
.filter(w -> doSimnet(w, keyword) > CRITICAL_VALUE)
.sorted(Comparator.comparingDouble(word -> Double.parseDouble(template.opsForValue().get(getScoreKey((String) word, keyword)))).reversed())
.limit(size)
.collect(Collectors.toList());
return keywords;
}
/**
* 计算关键词相关
*
* @param keyword
* @param words
* @return
* @author houkun
* @date 2017/12/6
*/
/**
 * Computes the relatedness score between {@code keyword} and each word in {@code words}.
 *
 * @param keyword the reference keyword every word is compared against
 * @param words   candidate words; duplicates keep the last computed score (same as a Map put)
 * @return map from (keyword, word) pair to its similarity score as produced by
 *         {@link #doSimEmbedding(String, String)}
 * @author houkun
 * @date 2017/12/6
 */
public Map<Pair<String, String>, Double> calcKeywordsRelated(String keyword, List<String> words) {
    // BUG FIX: the original called parallelStream().forEach(...) and put() into a plain
    // HashMap from multiple threads. HashMap is not thread-safe; concurrent puts can lose
    // entries or corrupt the table. Collect with a concurrent collector instead.
    return words.parallelStream()
            .collect(Collectors.toConcurrentMap(
                    word -> new ImmutablePair<>(keyword, word),
                    word -> doSimEmbedding(keyword, word),
                    // duplicate words in the input: keep the later value, mirroring put() overwrite
                    (first, second) -> second));
}
......@@ -172,7 +197,6 @@ public final class AipUtilBean {
log.debug(s);
JSONObject res = doLexer(s);
JSONArray items = res.getJSONArray("items");
// log.debug(items.toString(2));
lexerItems = ListUtils.union(lexerItems, JSON.parseArray(items.toString(), LexerItem.class));
}
return lexerItems;
......@@ -226,6 +250,81 @@ public final class AipUtilBean {
return res;
}
/**
* 计算两个词的相关程度
*
* @param s1
* @param s2
* @return
* @throws JSONException
* @author houkun
* @date 2017/12/6
*/
/**
 * Computes the relatedness of two words via Baidu's word-embedding similarity API,
 * caching the result in Redis for one hour.
 *
 * <p>Error handling: Baidu error code 282000 (internal error) is retried a bounded
 * number of times; any other error code is logged and falls back to the legacy
 * {@link #doSimnet(String, String)} API.
 *
 * @param s1 first word
 * @param s2 second word
 * @return similarity score, or 0 when the API response cannot be parsed
 * @author houkun
 * @date 2017/12/6
 */
private double doSimEmbedding(String s1, String s2) {
    String key = getScoreKey(s1, s2);
    String cached = template.opsForValue().get(key);
    if (StringUtils.isNotEmpty(cached)) {
        return Double.parseDouble(cached);
    }
    // BUG FIX: the original recursed into itself on INTERNAL_ERROR with no depth limit,
    // so a persistently failing API caused a StackOverflowError. Retry in a bounded loop.
    final int maxAttempts = 3;
    try {
        double score = 0;
        boolean resolved = false;
        for (int attempt = 0; attempt < maxAttempts && !resolved; attempt++) {
            JSONObject res = aipNlp.wordSimEmbedding(s1, s2);
            if (!res.has("error_code")) {
                score = res.getDouble("score");
                resolved = true;
            } else if (INTERNAL_ERROR.equals(res.getString("error_code"))) {
                // transient internal error on Baidu's side: retry
            } else {
                log.warn(res.toString());
                score = doSimnet(s1, s2);
                resolved = true;
            }
        }
        if (!resolved) {
            // internal-error retries exhausted: last-resort fallback to the legacy API
            score = doSimnet(s1, s2);
        }
        template.opsForValue().set(key, String.valueOf(score), 1, TimeUnit.HOURS);
        return score;
    } catch (JSONException e) {
        // Malformed response: log instead of swallowing silently, then degrade to 0.
        log.warn("wordSimEmbedding response parse failed for [{}]/[{}]", s1, s2, e);
        return 0;
    }
}
/**
* 旧 aip 计算两个词的相关程度
*
* @param s1
* @param s2
* @return
* @throws JSONException
* @author houkun
* @date 2017/12/6
*/
/**
 * Computes the relatedness of two words via the legacy Baidu simnet API.
 *
 * <p>Error handling: the internal error code is retried a bounded number of times;
 * any other error code is surfaced as a {@link ServiceException}.
 *
 * @param s1 first word
 * @param s2 second word
 * @return similarity score; 0 when retries are exhausted or the response cannot be parsed
 * @throws ServiceException when the API reports a non-internal error
 * @author houkun
 * @date 2017/12/6
 */
private double doSimnet(String s1, String s2) {
    // BUG FIX: the original recursed into itself on INTERNAL_ERROR with no depth limit,
    // risking a StackOverflowError when the API keeps failing. Retry in a bounded loop.
    final int maxAttempts = 3;
    try {
        for (int attempt = 0; attempt < maxAttempts; attempt++) {
            JSONObject res = aipNlp.simnet(s1, s2, new HashMap<>(0));
            if (!res.has("error")) {
                return res.getDouble("score");
            }
            String errorCode = res.getString("error");
            if (!INTERNAL_ERROR.equals(errorCode)) {
                throw new ServiceException(errorCode);
            }
            // internal error: fall through and retry
        }
        // Retries exhausted; degrade gracefully.
        return 0;
    } catch (JSONException e) {
        return 0;
    }
}
/**
 * Builds the Redis cache key for the similarity score of a word pair.
 *
 * <p>NOTE(review): keys are order-sensitive ({@code (a,b)} and {@code (b,a)} cache
 * separately) and a word containing {@code ':'} could collide with another pair —
 * confirm whether that matters for the inputs used here.
 *
 * @param s1 first word
 * @param s2 second word
 * @return cache key of the form {@code WORD_SCORE:<s1>:<s2>}
 */
private String getScoreKey(String s1, String s2) {
    return String.join(":", SCORE_KEY_PREFIX + s1, s2);
}
/**
* 分割文档
......
......@@ -48,14 +48,14 @@ public class BaiduDemo {
// System.out.println(res.toString(2));
//
// // 句法
HashMap<String, Object> option = new HashMap<>();
option.put("mode", 1);
JSONObject des = client.depParser(text, option);
System.out.println(des.toString(2));
JSONObject des1 = client.depParser(text1, option);
System.out.print(des1.toString(2));
JSONObject des2 = client.depParser(text2, option);
System.out.print(des2.toString(2));
// HashMap<String, Object> option = new HashMap<>();
// option.put("mode", 1);
// JSONObject des = client.depParser(text, option);
// System.out.println(des.toString(2));
// JSONObject des1 = client.depParser(text1, option);
// System.out.print(des1.toString(2));
// JSONObject des2 = client.depParser(text2, option);
// System.out.print(des2.toString(2));
//
// // 相似
......@@ -64,9 +64,9 @@ public class BaiduDemo {
// JSONObject response = client.simnet("十九大","中国共产党第十九次全国代表大会", option1);
// System.out.println(response.toString(2));
// // 词向量
// JSONObject vect = client.wordEmbedding("习近平");
// System.out.println(vect.toString(2));
// 词向量
JSONObject vect = client.wordEmbedding("十九大");
System.out.println(vect.toString(2));
//
// // 相似
......
......@@ -16,6 +16,7 @@ package com.dookay.cihai.core;
import com.alibaba.fastjson.JSON;
import com.dookay.cihai.core.aip.AipUtilBean;
import org.apache.commons.lang3.tuple.Pair;
import org.junit.Assert;
import org.junit.Test;
import org.springframework.beans.factory.annotation.Autowired;
......@@ -70,9 +71,16 @@ public class BaiduTest extends CihaiCoreApplicationTests {
File file = resource.getFile();
FileReader reader = new FileReader(file);
String text = FileCopyUtils.copyToString(reader);
List<String> list = aipUtilBean.extractKeyWords(text, 10);
List<String> list = aipUtilBean.extractKeyWords("中国共产党第十九次全国代表大会", text, 15);
System.out.println(JSON.toJSONString(list));
Map<String, Long> map = aipUtilBean.extractNounWordsWithCount(text);
System.out.print(map.toString());
Map<Pair<String, String>, Double> map = aipUtilBean.calcKeywordsRelated("中国共产党第十九次全国代表大会", list);
for (String s : list) {
Map<Pair<String, String>, Double> map1 = aipUtilBean.calcKeywordsRelated(s, list);
System.out.print(map1);
}
System.out.print(map);
// Map<String, Long> map = aipUtilBean.extractNounWordsWithCount(text);
// System.out.print(map.toString());
}
}
......@@ -15,7 +15,13 @@ spring.datasource.druid.filter.config.enabled=true
mapper.mappers=com.dookay.coral.common.core.persistence.Mapper
mybatis.mapper-locations=classpath*:mapper/*.xml
spring.redis.host=192.168.2.27
# SECURITY(review): live credentials are committed to source control below.
# Rotate the Redis password and Baidu AIP keys, and move them to environment
# variables or an external secret store instead of this properties file.
spring.redis.password=100001
aip.app-id=10486245
aip.api-key=ws8qdxT51xm2qbWufxzRedI3
aip.secret-key=8b6g9ZyR69dFl6aqYdIOGa4IbOGgkdjh
logging.level.com.dookay.cihai.core=debug
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!