zyf

ad174ef1 · 张云飞 · 78862a3a · ad174ef1 · ad174ef1 · ad174ef1
Commit ad174ef1 authored Jul 11, 2020 by 张云飞
Hide whitespace changes
Inline Side-by-side

Showing with 167 additions and 18 deletions

ic-name/create-es-heming-index.py
+57 -0

ic-name/create-es-index.py
+6 -6

ic-name/elasticsearch_ic.py
+10 -10

ic-name/tx-ex-heming-insert.py
+94 -0

ic-name/tx-ex-insert.py
+0 -2

No files found.
--- a/ic-name/create-es-heming-index.py
+++ b/ic-name/create-es-heming-index.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time    : 2020-07-02 21:46
+# @Author  : zhangyunfei
+# @File    : create-es-index.py
+# @Software: PyCharm
+from elasticsearch import Elasticsearch
+
+
+def create_index():
+    '''
+     创建索引,创建索引名称为tx_ic_bigdata_business_heming_index_01，es7没有类型
+    '''
+    # 创建映射
+    mappings = {
+        "settings": {
+            "number_of_shards": 5,
+            "number_of_replicas": 1
+        },
+        "mappings": {
+            "properties": {
+                "id": {
+                    "type": "integer"
+                },
+                "company_name": {
+                    "type": "text",
+                    "fields": {
+                        "raw": {
+                            "type": "keyword"
+                        }
+                    },
+                    "analyzer": "ik_max_word"
+                },
+                "company_name_standard": {
+                    "type": "text",
+                    "fields": {
+                        "raw": {
+                            "type": "keyword"
+                        }
+                    },
+                    "analyzer": "standard"
+                }
+            }
+        }
+    }
+
+    es = Elasticsearch(['9.223.8.84'], http_auth=("elastic", "Brg2020!"), port=9200)
+
+    if es.indices.exists(index="tx_ic_bigdata_business_heming_index_01"):
+        print("索引已存在，正在删除----", )
+        es.indices.delete(index="tx_ic_bigdata_business_heming_index_01")
+    if es.indices.exists(index="tx_ic_bigdata_business_heming_index_01") is not True:
+        print("创建索引------")
+        res = es.indices.create(index='tx_ic_bigdata_business_heming_index_01', body=mappings)
+        print("成功创建索引------", res)
+
+create_index()
--- a/ic-name/create-es-index.py
+++ b/ic-name/create-es-index.py
@@ -6,7 +6,6 @@
 # @Software: PyCharm
 from elasticsearch import Elasticsearch

-
 def create_index():
    '''
     创建索引,创建索引名称为tx_ic_bigdata_business_index_01，es7没有类型
@@ -80,14 +79,15 @@ def create_index():

    }

-    # es = Elasticsearch(['43.247.184.94'], http_auth=("admines", "adminGSBes."), port=7200)
-    # es = Elasticsearch(['192.168.1.131'])
    es = Elasticsearch(['9.223.8.84'], http_auth=("elastic", "Brg2020!"), port=9200)
-    print("创建索引：")
-    print("索引是否存在：",es.indices.exists(index="tx_ic_bigdata_business_index_01"))
+
+    if es.indices.exists(index="tx_ic_bigdata_business_index_01"):
+        print("索引已存在，正在删除----", )
+        es.indices.delete(index="tx_ic_bigdata_business_index_01")
    if es.indices.exists(index="tx_ic_bigdata_business_index_01") is not True:
+        print("创建索引------")
        res = es.indices.create(index='tx_ic_bigdata_business_index_01', body=mappings)
-        print(res)
+        print("成功创建索引------",res)


 create_index()
--- a/ic-name/elasticsearch_ic.py
+++ b/ic-name/elasticsearch_ic.py
@@ -18,11 +18,11 @@ server.config['JSON_AS_ASCII'] = False


 class Elastic:
-    def __init__(self,ES_HOST,ES_USER,ES_PWD,ES_PORT):
+    def __init__(self, ES_HOST, ES_USER, ES_PWD, ES_PORT):
        self.es = Elasticsearch([ES_HOST], http_auth=(ES_USER, ES_PWD), port=ES_PORT)
-        self.index_name = 'bigdata_ic_gsb_company_04'
-        # self.index_name = 'tx_big_data_business_index_test'
-        self.index_type = '_doc'
+        # self.index_name = 'bigdata_ic_gsb_company_04'
+        self.index_name = 'tx_ic_bigdata_business_heming_index_01'
+        self.index_type = None

    def es_search(self, parameter):
        cityname = parameter['cityname']
@@ -48,8 +48,8 @@ class Elastic:
                            {
                                "query_string": {
                                    "default_field": "company_name",
-                                    "query": "\""+cityname+"\""
-                                 }
+                                    "query": "\"" + cityname + "\""
+                                }
                            },
                            {
                                "query_string": {
@@ -60,7 +60,7 @@ class Elastic:
                            {
                                "bool": {
                                    "should": [
-                                       {
+                                        {
                                            "match": {
                                                "company_name": btname
                                            }
@@ -115,8 +115,9 @@ class Elastic:
                    }
                }
                # querybody["query"]["bool"]["must"]["bool"]["should"].append(conditiontext)
-            print(json.dumps(querybody,ensure_ascii=False))
-            es_result = self.es.search(index=self.index_name, doc_type=self.index_type, body=querybody, size=100, request_timeout=1000)
+            print(json.dumps(querybody, ensure_ascii=False))
+            es_result = self.es.search(index=self.index_name, doc_type=self.index_type, body=querybody, size=100,
+                                       request_timeout=1000)
            items = es_result['hits']['hits']
            result = {
                "Status": "",
@@ -152,4 +153,3 @@ class Elastic:
                "Result": []
            }
            return result
-
--- a/ic-name/tx-ex-heming-insert.py
+++ b/ic-name/tx-ex-heming-insert.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time    : 2020-07-02 17:38
+# @Author  : zhangyunfei
+# @File    : tx-ex-insert.py
+# @Software: PyCharm
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time    : 2020-06-26 17:16
+# @Author  : zhangyunfei
+# @File    : tx_es_data.py
+# @Software: PyCharm
+
+from elasticsearch import Elasticsearch
+from elasticsearch import helpers
+from elasticsearch.helpers import bulk
+import json
+
+'''
+    读取txt里面的全部数据，插入到新的索引中
+'''
+
+def DetData(datainfo):#根据ES5中返回的结果，组织导入ES6数据结构
+    isTrue = True
+    action = {}
+    try:
+        doc = {}
+        action["_index"] = "tx_ic_bigdata_business_heming_index_01"
+        # action["_type"] = "_doc"   #elasticsearch7.0+版本需要注释
+        if "id" in datainfo.keys():
+            id = datainfo["id"]
+            doc["id"] = id
+            action["_id"] = id
+
+        if "company_name" in datainfo.keys():
+            company_name = datainfo["company_name"]
+            if company_name:
+                doc["company_name"] = company_name
+                doc["company_name_standard"] = company_name
+            else:
+                doc["company_name"] = None
+                doc["company_name_standard"] = None
+        else:
+            doc["company_name"] = None
+
+        action["_source"] = doc
+
+    except Exception as e:
+        isTrue = False
+    return isTrue, action
+
+def main():
+    error = "ES_error.txt"
+    f1 = open(error, "a")
+    index_name = "tx_ic_bigdata_business_heming_index_01"
+    es = Elasticsearch(['9.223.8.84'], http_auth=("elastic", "Brg2020!"), port=9200)
+    readfile = open('2.txt','r',encoding='utf-8')
+    lines = readfile.readlines()   # 获取全部
+    bulkList = []
+    for line in lines:
+        line = line.strip()
+        ll = json.loads(line)
+        isTrue, action = DetData(ll)
+        if isTrue:
+            bulkList.append(action)
+        else:
+            f1.write("获取" + str(ll) + "的数据出错！\r\n")
+        if len(bulkList) == 500:
+            print('插入500...............')
+            try:
+                bulk(es, bulkList, index=index_name, raise_on_error=True)  # 批量插入
+            except Exception as e:
+                try:
+                    for er in e.args[1]:  # 和上面的for是一样的，都能取到错误信息
+                        f1.write("插入" + str(er["index"]["data"]) + "的数据出错！\r\n")
+                except:
+                    f1.write("插入" + str(bulkList) + "的数据出错！\r\n")
+            bulkList = []
+    if len(bulkList) > 0:
+        try:
+            bulk(es, bulkList, index=index_name, raise_on_error=True)  # 批量插入
+        except Exception as e:
+            try:
+                for er in e.args[1]:  # 和上面的for是一样的，都能取到错误信息
+                    f1.write("插入" + str(er["index"]["data"]) + "的数据出错！\r\n")
+            except:
+                f1.write("插入" + str(bulkList) + "的数据出错！\r\n")
+        bulkList = []
+    f1.flush()
+    f1.close()
+    print("over")
+
+main()
\ No newline at end of file
--- a/ic-name/tx-ex-insert.py
+++ b/ic-name/tx-ex-insert.py
@@ -144,8 +144,6 @@ def main():
    f1 = open(error, "a")
    index_name = "tx_ic_bigdata_business_index_01"
    es = Elasticsearch(['9.223.8.84'], http_auth=("elastic", "Brg2020!"), port=9200)
-    # es = Elasticsearch(['43.247.184.94'], http_auth=("admines", "adminGSBes."), port=7200)
-    # es = Elasticsearch(['192.168.1.131'])
    readfile = open('2.txt','r',encoding='utf-8')
    lines = readfile.readlines()   # 获取全部
    bulkList = []