blog/assets/js/aa24fd5d.143c547f.js
github-actions[bot] 5eb80b6c5e deploy: 075b4a89ad
2023-11-28 19:27:15 +00:00

1 line
No EOL
18 KiB
JavaScript

"use strict";(self.webpackChunkfi=self.webpackChunkfi||[]).push([[7257],{90251:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>a,contentTitle:()=>h,default:()=>d,frontMatter:()=>r,metadata:()=>l,toc:()=>o});var s=n(85893),i=n(11151);const r={id:"python",slug:"/hash-tables/breaking/python",title:"Breaking Python",description:"Actually getting the worst-case time complexity in Python.\n",tags:["cpp","python","hash-tables"],last_update:{date:new Date("2023-11-28T00:00:00.000Z")}},h="Breaking the Hash Table in Python",l={id:"hash-tables/2023-11-28-breaking/python",title:"Breaking Python",description:"Actually getting the worst-case time complexity in Python.\n",source:"@site/algorithms/12-hash-tables/2023-11-28-breaking/01-python.md",sourceDirName:"12-hash-tables/2023-11-28-breaking",slug:"/hash-tables/breaking/python",permalink:"/algorithms/hash-tables/breaking/python",draft:!1,unlisted:!1,editUrl:"https://github.com/mfocko/blog/tree/main/algorithms/12-hash-tables/2023-11-28-breaking/01-python.md",tags:[{label:"cpp",permalink:"/algorithms/tags/cpp"},{label:"python",permalink:"/algorithms/tags/python"},{label:"hash-tables",permalink:"/algorithms/tags/hash-tables"}],version:"current",lastUpdatedAt:1701129600,formattedLastUpdatedAt:"Nov 28, 2023",sidebarPosition:1,frontMatter:{id:"python",slug:"/hash-tables/breaking/python",title:"Breaking Python",description:"Actually getting the worst-case time complexity in Python.\n",tags:["cpp","python","hash-tables"],last_update:{date:"2023-11-28T00:00:00.000Z"}},sidebar:"autogeneratedBar",previous:{title:"Breaking Hash Table",permalink:"/algorithms/hash-tables/breaking"},next:{title:"Possible Mitigations",permalink:"/algorithms/hash-tables/breaking/mitigations"}},a={},o=[{value:"Preparing the attack",id:"preparing-the-attack",level:2},{value:"Sequences",id:"sequences",level:3},{value:"Results",id:"results",level:2},{value:"Comparing with the tree",id:"comparing-with-the-tree",level:2},{value:"References",id:"references",level:2}];function c(e){const t={a:"a",admonition:"admonition",code:"code",em:"em",h1:"h1",h2:"h2",h3:"h3",hr:"hr",li:"li",ol:"ol",p:"p",pre:"pre",section:"section",strong:"strong",sup:"sup",table:"table",tbody:"tbody",td:"td",th:"th",thead:"thead",tr:"tr",ul:"ul",...(0,i.a)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(t.h1,{id:"breaking-the-hash-table-in-python",children:"Breaking the Hash Table in Python"}),"\n",(0,s.jsxs)(t.p,{children:["Our language of choice for bringing the worst out of the hash table is ",(0,s.jsx)(t.em,{children:"Python"}),"."]}),"\n",(0,s.jsxs)(t.p,{children:["Let's start by talking about the hash function and why we've chosen Python for\nthis. Hash function for integers in Python is simply ",(0,s.jsx)(t.em,{children:"identity"}),", as you might've\nguessed, there's no avalanche effect. Another thing that helps us is the fact\nthat integers in Python are technically ",(0,s.jsx)(t.code,{children:"BigInt"}),"s",(0,s.jsx)(t.sup,{children:(0,s.jsx)(t.a,{href:"#user-content-fn-1",id:"user-content-fnref-1","data-footnote-ref":!0,"aria-describedby":"footnote-label",children:"1"})}),". This allows us to put bit\nmore pressure on the hashing function."]}),"\n",(0,s.jsxs)(t.p,{children:["From the perspective of the implementation, it is a hash table that uses probing\nto resolve conflicts. This also means that it's a contiguous space in memory.\nIndexing works like in the provided example above. When the hash table reaches\na ",(0,s.jsx)(t.em,{children:"breaking point"})," (defined somewhere in the C code), it reallocates the table\nand rehashes everything."]}),"\n",(0,s.jsx)(t.admonition,{type:"tip",children:(0,s.jsx)(t.p,{children:"Resizing and rehashing can reduce the conflicts. That is coming from the fact\nthat the position in the table is determined by the hash and the size of the\ntable itself."})}),"\n",(0,s.jsx)(t.h2,{id:"preparing-the-attack",children:"Preparing the attack"}),"\n",(0,s.jsx)(t.p,{children:"Knowing the things above, it is not that hard to construct a method how to cause\nas many conflicts as possible. Let's go over it:"}),"\n",(0,s.jsxs)(t.ol,{children:["\n",(0,s.jsx)(t.li,{children:"We know that integers are hashed to themselves."}),"\n",(0,s.jsx)(t.li,{children:"We also know that from that hash we use only lower bits that are used as\nindices."}),"\n",(0,s.jsx)(t.li,{children:"We also know that there's a rehashing on resize that could possibly fix the\nconflicts."}),"\n"]}),"\n",(0,s.jsx)(t.p,{children:"We will test with different sequences:"}),"\n",(0,s.jsxs)(t.ol,{children:["\n",(0,s.jsx)(t.li,{children:"ordered one, numbers through 1 to N"}),"\n",(0,s.jsx)(t.li,{children:"ordered one in a reversed order, numbers through N back to 1"}),"\n",(0,s.jsx)(t.li,{children:"numbers that are shifted to the left, so they create conflicts until resize"}),"\n",(0,s.jsx)(t.li,{children:"numbers that are shifted to the left, but resizing helps only in the end"}),"\n",(0,s.jsx)(t.li,{children:"numbers that are shifted to the left, but they won't be taken in account even\nafter final resize"}),"\n"]}),"\n",(0,s.jsx)(t.p,{children:"For each of these sequences, we will insert 10\u2077 elements and look each of them\nup for 10 times in a row."}),"\n",(0,s.jsxs)(t.p,{children:["As a base of our benchmark, we will use a ",(0,s.jsx)(t.code,{children:"Strategy"})," class and then for each\nstrategy we will just implement the sequence of numbers that it uses:"]}),"\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{className:"language-py",children:'class Strategy:\n def __init__(self, data_structure=set):\n self._table = data_structure()\n\n @cached_property\n def elements(self):\n raise NotImplementedError("Implement for each strategy")\n\n @property\n def name(self):\n raise NotImplementedError("Implement for each strategy")\n\n def run(self):\n print(f"\\nBenchmarking:\\t\\t{self.name}")\n\n # Extract the elements here, so that the evaluation of them does not\n # slow down the relevant part of benchmark\n elements = self.elements\n\n # Insertion phase\n start = monotonic_ns()\n for x in elements:\n self._table.add(x)\n after_insertion = monotonic_ns()\n\n print(f"Insertion phase:\\t{(after_insertion - start) / 1000000:.2f}ms")\n\n # Lookup phase\n start = monotonic_ns()\n for _ in range(LOOPS):\n for x in elements:\n assert x in self._table\n after_lookups = monotonic_ns()\n\n print(f"Lookup phase:\\t\\t{(after_lookups - start) / 1000000:.2f}ms")\n'})}),"\n",(0,s.jsx)(t.h3,{id:"sequences",children:"Sequences"}),"\n",(0,s.jsx)(t.p,{children:"Let's have a look at how we generate the numbers to be inserted:"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsxs)(t.li,{children:["ordered sequence (ascending)","\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{className:"language-py",children:"x for x in range(N_ELEMENTS)\n"})}),"\n"]}),"\n",(0,s.jsxs)(t.li,{children:["ordered sequence (descending)","\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{className:"language-py",children:"x for x in reversed(range(N_ELEMENTS))\n"})}),"\n"]}),"\n",(0,s.jsxs)(t.li,{children:["progressive sequence that \u201cheals\u201d on resize","\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{className:"language-py",children:"(x << max(5, x.bit_length())) for x in range(N_ELEMENTS)\n"})}),"\n"]}),"\n",(0,s.jsxs)(t.li,{children:["progressive sequence that \u201cheals\u201d in the end","\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{className:"language-py",children:"(x << max(5, x.bit_length())) for x in reversed(range(N_ELEMENTS))\n"})}),"\n"]}),"\n",(0,s.jsxs)(t.li,{children:["conflicts everywhere","\n",(0,s.jsx)(t.pre,{children:(0,s.jsx)(t.code,{className:"language-py",children:"x << 32 for x in range(N_ELEMENTS)\n"})}),"\n"]}),"\n"]}),"\n",(0,s.jsx)(t.h2,{id:"results",children:"Results"}),"\n",(0,s.jsx)(t.p,{children:"Let's have a look at the obtained results after running the code:"}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{style:{textAlign:"center"},children:"Technique"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Insertion phase"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Lookup phase"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"ordered sequence (ascending)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"558.60ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"3304.26ms"})})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"ordered sequence (descending)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"554.08ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"3365.84ms"})})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"progressive sequence that \u201cheals\u201d on resize"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"3781.30ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"28565.71ms"})})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"progressive sequence that \u201cheals\u201d in the end"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"3280.38ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"26494.61ms"})})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"conflicts everywhere"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"4027.54ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"29132.92ms"})})]})]})]}),"\n",(0,s.jsx)(t.p,{children:"You can see a noticable \u201cjump\u201d in the time after switching to the \u201cprogressive\u201d\nsequence. The last sequence that has conflicts all the time has the worst time,\neven though it's rather comparable with the first progressive sequence with\nregards to the insertion phase."}),"\n",(0,s.jsxs)(t.p,{children:["If we were to compare the ",(0,s.jsx)(t.em,{children:"always conflicting"})," one with the first one, we can\nsee that insertion took over 7\xd7 longer and lookups almost 9\xd7 longer."]}),"\n",(0,s.jsxs)(t.p,{children:["You can have a look at the code ",(0,s.jsx)(t.a,{href:"path:///files/algorithms/hash-tables/breaking/benchmark.py",children:"here"}),"."]}),"\n",(0,s.jsx)(t.h2,{id:"comparing-with-the-tree",children:"Comparing with the tree"}),"\n",(0,s.jsxs)(t.admonition,{type:"danger",children:[(0,s.jsxs)(t.p,{children:["Source code can be found ",(0,s.jsx)(t.a,{href:"path:///files/algorithms/hash-tables/breaking/benchmark.cpp",children:"here"}),"."]}),(0,s.jsx)(t.p,{children:(0,s.jsx)(t.em,{children:"Viewer discretion advised."})})]}),"\n",(0,s.jsx)(t.p,{children:"Python doesn't have a tree structure for sets/maps implemented, therefore for\na comparison we will run a similar benchmark in C++. By running the same\nsequences on both hash table and tree (RB-tree) we will obtain the following\nresults:"}),"\n",(0,s.jsxs)(t.table,{children:[(0,s.jsx)(t.thead,{children:(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.th,{style:{textAlign:"center"},children:"Technique"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Insertion (hash)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Lookup (hash)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Insertion (tree)"}),(0,s.jsx)(t.th,{style:{textAlign:"right"},children:"Lookup (tree)"})]})}),(0,s.jsxs)(t.tbody,{children:[(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"ordered (ascending)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"316ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"298ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"2098ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"5914ms"})})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"ordered (descending)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"259ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"315ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"1958ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"14747ms"})})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"progressive a)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"1152ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"6021ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"2581ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"16074ms"})})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"progressive b)"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"1041ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"6096ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"2770ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"15986ms"})})]}),(0,s.jsxs)(t.tr,{children:[(0,s.jsx)(t.td,{style:{textAlign:"center"},children:"conflicts"}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"964ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"1633ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"2559ms"})}),(0,s.jsx)(t.td,{style:{textAlign:"right"},children:(0,s.jsx)(t.code,{children:"13285ms"})})]})]})]}),"\n",(0,s.jsx)(t.admonition,{type:"note",children:(0,s.jsx)(t.p,{children:"We can't forget that implementation details be involved. Hash function is still\nthe identity, to my knowledge."})}),"\n",(0,s.jsx)(t.p,{children:"One interesting thing to notice is the fact that the progressive sequences took\nthe most time in lookups (which is not same as in the Python)."}),"\n",(0,s.jsx)(t.p,{children:"Now, if we have a look at the tree implementation, we can notice two very\ndistinctive things:"}),"\n",(0,s.jsxs)(t.ol,{children:["\n",(0,s.jsxs)(t.li,{children:["Tree implementations are not affected by the input, therefore (except for the\nfirst sequence) we can see ",(0,s.jsx)(t.strong,{children:"very consistent"})," times."]}),"\n",(0,s.jsx)(t.li,{children:"Compared to the hash table the times are much higher and not very ideal."}),"\n"]}),"\n",(0,s.jsx)(t.p,{children:"The reason for the 2nd point may not be very obvious. From the technical\nperspective it makes some sense. Let's dive into it!"}),"\n",(0,s.jsxs)(t.p,{children:["If we take a hash table, it is an array in a memory, therefore it is contiguous\npiece of memory. (For more information I'd suggest looking into the 1st blog\npost below in references section by ",(0,s.jsx)(t.em,{children:"Bjarne Stroustrup"}),")"]}),"\n",(0,s.jsxs)(t.p,{children:["On the other hand, if we take a look at the tree, each node holds some\nattributes and pointers to the left and right descendants of itself. Even if we\nmaintain a reasonable height of the tree (keep the tree balanced), we still need\nto follow the pointers which point to the nodes ",(0,s.jsx)(t.em,{children:"somewhere"})," on the heap. When\ntraversing the tree, we get a consistent time complexity, but at the expense of\njumping between the nodes on the heap which takes some time."]}),"\n",(0,s.jsxs)(t.admonition,{type:"danger",children:[(0,s.jsx)(t.p,{children:"This is not supposed to leverage the hash table and try to persuade people not\nto use the tree representations. There are benefits coming from the respective\ndata structures, even if the time is not the best."}),(0,s.jsx)(t.p,{children:"Overall if we compare the worst-case time complexities of the tree and hash\ntable, tree representation comes off better."})]}),"\n",(0,s.jsx)(t.admonition,{title:"Challenge",type:"tip",children:(0,s.jsx)(t.p,{children:"Try to benchmark with the similar approach in the Rust. Since Rust uses\ndifferent hash function, it would be the best to just override the hash, this\nway you can also avoid the hard part of this attack (making up the numbers that\nwill collide)."})}),"\n",(0,s.jsx)(t.hr,{}),"\n",(0,s.jsx)(t.h2,{id:"references",children:"References"}),"\n",(0,s.jsxs)(t.ol,{children:["\n",(0,s.jsxs)(t.li,{children:["Bjarne Stroustrup.\n",(0,s.jsx)(t.a,{href:"https://www.stroustrup.com/bs_faq.html#list",children:"Are lists evil?"})]}),"\n"]}),"\n",(0,s.jsxs)(t.section,{"data-footnotes":!0,className:"footnotes",children:[(0,s.jsx)(t.h2,{className:"sr-only",id:"footnote-label",children:"Footnotes"}),"\n",(0,s.jsxs)(t.ol,{children:["\n",(0,s.jsxs)(t.li,{id:"user-content-fn-1",children:["\n",(0,s.jsxs)(t.p,{children:["Arbitrary-sized integers, they can get as big as your memory allows. ",(0,s.jsx)(t.a,{href:"#user-content-fnref-1","data-footnote-backref":"","aria-label":"Back to reference 1",className:"data-footnote-backref",children:"\u21a9"})]}),"\n"]}),"\n"]}),"\n"]})]})}function d(e={}){const{wrapper:t}={...(0,i.a)(),...e.components};return t?(0,s.jsx)(t,{...e,children:(0,s.jsx)(c,{...e})}):c(e)}},11151:(e,t,n)=>{n.d(t,{Z:()=>l,a:()=>h});var s=n(67294);const i={},r=s.createContext(i);function h(e){const t=s.useContext(r);return s.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function l(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(i):e.components||i:h(e.components),s.createElement(r.Provider,{value:t},e.children)}}}]);